**Задача 3. Модель автокодировщика**

Провести анализ модели автокодировщика (не вариационного) для выборки Twitter (эмбеддинги предложений). Требуется сравнить качество восстановления предложения в зависимости от:

```
размера слоя;
числа слоев;
параметра dropout;
добавления BatchNorm;
размера словаря;
токенизатора - дополнительное задание (со звездочкой).
```



In [None]:
! pip install transformers
! pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
import torch
import torchmetrics

from torch.utils.tensorboard import SummaryWriter

In [None]:
# Select the compute device: CUDA when PyTorch reports one as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
device

'cpu'

In [None]:
dataset = pd.read_csv('/content/drive/MyDrive/Study/8 sem/MachineLearning/HW1/task3/tweets.csv')
dataset

Unnamed: 0,tag,message
0,0.0,is so sad for my APL friend.............
1,0.0,I missed the New Moon trailer...
2,1.0,omg its already 7:30 :O
3,0.0,.. Omgaga. Im sooo im gunna CRy. I've been at...
4,0.0,i think mi bf is cheating on me!!! T_T
...,...,...
1578609,1.0,Zzzzzz.... Finally! Night tweeters!
1578610,1.0,"Zzzzzzz, sleep well people"
1578611,0.0,ZzzZzZzzzZ... wait no I have homework.
1578612,0.0,"ZzZzzzZZZZzzz meh, what am I doing up again?"


In [None]:
# Keep only the rows in which both the label ('tag') and the text ('message')
# are present.
dataset = dataset.loc[dataset[['tag', 'message']].notna().all(axis=1)]
dataset.head()

Unnamed: 0,tag,message
0,0.0,is so sad for my APL friend.............
1,0.0,I missed the New Moon trailer...
2,1.0,omg its already 7:30 :O
3,0.0,.. Omgaga. Im sooo im gunna CRy. I've been at...
4,0.0,i think mi bf is cheating on me!!! T_T


In [None]:
# Work on a fixed 40k-row subsample and split it roughly 80/20 into train/test.
# The split mask is drawn from a seeded generator so that, like the subsample
# itself (random_state=42), the train/test partition is the same on every run.
dataset = dataset.sample(40000, random_state=42)
train_mask = np.random.default_rng(42).random(len(dataset)) < 0.8
dataset_train = dataset[train_mask]
dataset_test = dataset[~train_mask]

In [None]:
dataset_train.sample(5, random_state=42)

Unnamed: 0,tag,message
1531989,0.0,sitting rubbing my eye from hay fever
772286,0.0,@ElZorro: &quot;&quot;UB40-&quot;Bring Me Your...
1060788,0.0,Must attempt to rest before I start cleaning t...
1292702,0.0,working at office even Sunday.
1106752,0.0,No tennis stupid courts are to wet


In [None]:
train = dataset_train['message'].to_list()
test = dataset_test['message'].to_list()

In [None]:
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE", verbose=False)
tokenizer.vocab_size


501153

In [None]:
tokenized_train_in_num = tokenizer(train, return_tensors='pt', max_length=64, padding=True, truncation=True)['input_ids']
tokenized_test_in_num = tokenizer(test, return_tensors='pt', max_length=64, padding=True, truncation=True)['input_ids']

In [None]:
unk_token_id, pad_token_id = tokenizer.unk_token_id, tokenizer.pad_token_id
unk_token_id, pad_token_id

(100, 0)

Очень большой словарь, проредим и переиндексируем (оставляем токены из train)

In [None]:
# Mapping "original tokenizer id -> compact id". Padding and unknown get the
# reserved compact ids 0 and 1; every other id that occurs in the training
# tensor receives the next free index, in the order unique() returns them.
vocab_old_new = {pad_token_id: 0, unk_token_id: 1}
for token in tokenized_train_in_num.unique().tolist():
  vocab_old_new.setdefault(token, len(vocab_old_new))

print(vocab_old_new)

{0: 0, 100: 1, 101: 2, 102: 3, 106: 4, 108: 5, 109: 6, 110: 7, 111: 8, 112: 9, 113: 10, 114: 11, 115: 12, 116: 13, 117: 14, 118: 15, 119: 16, 120: 17, 121: 18, 122: 19, 123: 20, 124: 21, 125: 22, 126: 23, 127: 24, 128: 25, 129: 26, 130: 27, 131: 28, 132: 29, 134: 30, 136: 31, 137: 32, 138: 33, 139: 34, 140: 35, 141: 36, 142: 37, 143: 38, 144: 39, 145: 40, 146: 41, 147: 42, 148: 43, 149: 44, 150: 45, 151: 46, 152: 47, 153: 48, 154: 49, 155: 50, 156: 51, 157: 52, 158: 53, 159: 54, 160: 55, 161: 56, 162: 57, 163: 58, 164: 59, 165: 60, 166: 61, 167: 62, 168: 63, 169: 64, 170: 65, 171: 66, 172: 67, 173: 68, 174: 69, 175: 70, 176: 71, 177: 72, 178: 73, 179: 74, 180: 75, 181: 76, 182: 77, 183: 78, 184: 79, 185: 80, 186: 81, 187: 82, 188: 83, 189: 84, 190: 85, 191: 86, 192: 87, 193: 88, 194: 89, 195: 90, 196: 91, 197: 92, 198: 93, 199: 94, 201: 95, 202: 96, 204: 97, 205: 98, 206: 99, 207: 100, 211: 101, 212: 102, 213: 103, 214: 104, 216: 105, 219: 106, 221: 107, 222: 108, 223: 109, 226: 110, 2

In [None]:
def with_new_vocab(tokenized_data_in_num, vocab_old_new):
  """Re-index a batch of token-id sequences with the compact vocabulary.

  Args:
    tokenized_data_in_num: iterable of sequences of original token ids; every
      element is converted with int() before the lookup.
    vocab_old_new: dict mapping original token id -> compact id.

  Returns:
    A list of lists of ints with the same nesting as the input. Ids missing
    from `vocab_old_new` are replaced by the compact id of the module-level
    `unk_token_id`.
  """
  def remap(raw_id):
    old_id = int(raw_id)
    if old_id not in vocab_old_new:
      # Out-of-vocabulary id: fall back to the unknown token.
      old_id = unk_token_id
    return int(vocab_old_new[old_id])

  return [[remap(raw_id) for raw_id in sentence] for sentence in tokenized_data_in_num]


In [None]:
new_tokenized_train = with_new_vocab(tokenized_train_in_num, vocab_old_new)
new_tokenized_test = with_new_vocab(tokenized_test_in_num, vocab_old_new)

In [None]:
train_dataset = torch.utils.data.TensorDataset(torch.tensor(new_tokenized_train), torch.tensor(new_tokenized_train))
test_dataset = torch.utils.data.TensorDataset(torch.tensor(new_tokenized_test), torch.tensor(new_tokenized_test))

In [None]:
# Mini-batch loaders. Training batches are reshuffled every epoch so the
# optimizer does not see the samples in the same fixed order each time; the
# test loader keeps its order so its first batch is the same on every pass.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=32)


In [None]:
next(iter(train_dataloader))[0].shape

torch.Size([32, 64])

In [None]:
def encode_sentence(sentence, tokenizer, vocab_old_new):
  """Tokenize one sentence and express it in compact-vocabulary ids.

  Args:
    sentence: text passed to `tokenizer.encode`.
    tokenizer: object whose `encode(sentence)` returns a sequence of token ids.
    vocab_old_new: dict mapping original token id -> compact id.

  Returns:
    List of compact ids, one per token produced by the tokenizer. Ids missing
    from `vocab_old_new` map to the entry of the module-level `unk_token_id`.
  """
  return [
      vocab_old_new[old_id if old_id in vocab_old_new else unk_token_id]
      for old_id in map(int, tokenizer.encode(sentence))
  ]

In [None]:
def decode_sentence(tokenized_sentence, tokenizer, vocab_old_new):
  """Turn a sequence of compact ids back into text.

  Args:
    tokenized_sentence: iterable of compact ids (keys of the inverted mapping).
    tokenizer: object whose `decode(ids)` converts original ids to a string.
    vocab_old_new: dict mapping original token id -> compact id; it is
      inverted here on every call.

  Returns:
    Whatever `tokenizer.decode` returns for the recovered original ids.

  Raises:
    KeyError: if a compact id is not a value of `vocab_old_new`.
  """
  vocab_new_old = {new_id: old_id for old_id, new_id in vocab_old_new.items()}
  return tokenizer.decode([vocab_new_old[new_id] for new_id in tokenized_sentence])

In [None]:
decode_sentence(encode_sentence('hello world', tokenizer, vocab_old_new), tokenizer, vocab_old_new)

'[CLS] hello world [SEP]'

# Model

In [None]:
class Encoder(torch.nn.Module):
  """Embedding layer followed by a bidirectional LSTM.

  Args:
    vocab_size: number of rows of the embedding table.
    emb_dim: embedding width, i.e. the LSTM input size.
    hidden_dim: LSTM hidden size per direction.
    p: dropout probability used between stacked LSTM layers.
    num_layers: number of stacked LSTM layers.
  """

  def __init__(self, vocab_size, emb_dim=30, hidden_dim=30, p=0, num_layers=1):
    super().__init__()
    self.embedding = torch.nn.Embedding(vocab_size, emb_dim)
    # nn.LSTM applies dropout only between stacked layers; with a single layer
    # it has no effect and only triggers a warning, so it is disabled there.
    self.lstm = torch.nn.LSTM(emb_dim, hidden_dim, num_layers=num_layers,
                              batch_first=True, bidirectional=True,
                              dropout=p if num_layers > 1 else 0)

  def forward(self, input):
    """Embed a batch of token ids and return the LSTM result `(out, (h, c))`."""
    embedded = self.embedding(input)
    out, h_c = self.lstm(embedded)
    return out, h_c


class Decoder(torch.nn.Module):
  """Bidirectional LSTM followed by a linear projection onto the vocabulary.

  Args:
    vocab_size: size of the output projection (number of classes).
    encoded_dim: feature size of the sequence received from the encoder.
    hidden_dim: LSTM hidden size per direction.
    p: dropout probability (between stacked LSTM layers and before the
      output projection).
    num_layers: number of stacked LSTM layers; it must match the encoder,
      because the encoder's final state is used as the initial state here.
  """

  def __init__(self, vocab_size, encoded_dim, hidden_dim=30, p=0, num_layers=1):
    super().__init__()
    self.lstm = torch.nn.LSTM(encoded_dim, hidden_dim, num_layers=num_layers,
                              batch_first=True, bidirectional=True,
                              dropout=p if num_layers > 1 else 0)
    self.linear = torch.nn.Linear(2*hidden_dim, vocab_size)
    self.dropout = torch.nn.Dropout(p)

  def forward(self, encoder_output):
    """Map the encoder result `(out, (h, c))` to per-token vocabulary logits."""
    x, h_c = encoder_output
    out, _ = self.lstm(x, h_c)
    # Dropout regularizes the recurrent features. Applying it to the logits
    # instead would randomly zero class scores (including the correct one)
    # right before the loss.
    return self.linear(self.dropout(out))


class AutoEncoder(torch.nn.Module):
  """Sequence autoencoder: `Encoder` -> `Decoder`, trained to reproduce its input.

  Args:
    vocab_size: number of token classes; also exposed as `num_classes`.
    latent_dim: stored for logging only; not used by the network.
    emb_dim: embedding width of the encoder.
    hidden_dim: LSTM hidden size per direction in both halves.
    num_layers: number of stacked LSTM layers in both halves.
    p: dropout probability passed to both halves.
    zero_input: accepted for backward compatibility; currently ignored.
  """

  def __init__(self, vocab_size, latent_dim=30, emb_dim=30, hidden_dim=30,
               num_layers=1, p=0, zero_input=True):
    super().__init__()

    self.num_classes = vocab_size

    # The layer count is forwarded so that the `num_layers` setting actually
    # changes the network instead of only changing the logged tag.
    self.encoder = Encoder(vocab_size, emb_dim, hidden_dim, p, num_layers)
    # The encoder is bidirectional, hence the decoder input width 2*hidden_dim.
    self.decoder = Decoder(vocab_size, 2*hidden_dim, hidden_dim, p, num_layers)

    self.hidden_dim = hidden_dim
    self.latent_dim = latent_dim
    self.emb_dim = emb_dim
    self.num_layers = num_layers
    # Kept as an attribute for backward compatibility; forward() does not use it.
    self.dropout = torch.nn.Dropout(p)
    self.p = p

  def forward(self, input):
    """Return logits with the class axis moved to dim 1.

    The decoder output has its last two axes swapped, which puts the
    vocabulary axis in the position `cross_entropy` expects for
    sequence targets.
    """
    logits = self.decoder(self.encoder(input))
    return logits.transpose(1, 2)


In [None]:
def get_acc_loss_pred(model, dataloader, need_predict=True, device='cpu'):
  """Measure the token-level accuracy of `model` on `dataloader`.

  Despite the name, no loss is computed. The model is evaluated in eval mode
  with gradients disabled, and its previous train/eval mode is restored
  afterwards.

  Args:
    model: module exposing `num_classes`; called as `model(features)`.
    dataloader: iterable of `(features, labels)` batches.
    need_predict: when True, only the first batch is evaluated and the
      predicted class ids of its first sample are returned as well.
    device: device the batches are moved to before the forward pass.

  Returns:
    The accuracy averaged over the evaluated batches (targets equal to 0 are
    ignored) or, if `need_predict` is True, a tuple of that accuracy and a
    list with the arg-max class ids of the first sample of the first batch.

  Raises:
    ValueError: if `dataloader` yields no batches.
  """
  acc_total = 0
  batches_seen = 0
  preds = None

  was_training = model.training
  model.eval()  # switch dropout off while measuring
  try:
    with torch.no_grad():
      for features, labels in dataloader:
        features = features.to(device)
        labels = labels.to(device)
        preds = model(features)

        acc_total += torchmetrics.functional.accuracy(
            preds, labels, task='multiclass',
            num_classes=model.num_classes, ignore_index=0)
        batches_seen += 1

        if need_predict:
          # Only the first batch is needed to produce a sample prediction.
          break
  finally:
    model.train(was_training)

  if batches_seen == 0:
    raise ValueError('dataloader yielded no batches')

  # Average over the batches actually evaluated, not over len(dataloader):
  # with need_predict=True only a single batch contributes to the sum.
  mean_acc = acc_total / batches_seen

  if need_predict:
    return mean_acc, torch.argmax(preds, 1)[0].cpu().tolist()
  return mean_acc

In [None]:
def print_info_in_writer(model, writer, train_acc, val_acc, step):
  """Log train and validation accuracy for one model configuration.

  Both values are written with `writer.add_scalars` under the main tags
  'TRAIN/accuracy' and 'VAL/accuracy'; the curve name is built from the
  model's hyper-parameter attributes, so every configuration gets its own
  curve.

  Args:
    model: object exposing `num_classes`, `latent_dim`, `emb_dim`,
      `hidden_dim`, `num_layers` and `p`.
    writer: object with an `add_scalars` method, or None to skip logging.
    train_acc: value logged under 'TRAIN/accuracy'.
    val_acc: value logged under 'VAL/accuracy'.
    step: global step attached to both values.
  """
  if writer is None:
    return

  run_tag = f'vocab_size = {model.num_classes}, latent_dim={model.latent_dim}, emb_dim={model.emb_dim}, hidden_dim={model.hidden_dim}, num_layers={model.num_layers}, drop_p={model.p},'

  for main_tag, value in (('TRAIN/accuracy', train_acc), ('VAL/accuracy', val_acc)):
    writer.add_scalars(main_tag=main_tag,
                       tag_scalar_dict={run_tag: value},
                       global_step=step)

In [None]:
def fit(model, train_dataloader, val_dataloader, optimizer,
        max_epochs=25, device=torch.device("cpu"), writer = None):
  """Train `model` with cross-entropy and optionally log progress.

  When a writer is given, a decoded validation prediction is logged before
  training and after every epoch, and train/validation accuracy is logged
  every 500 optimizer steps. Without a writer, the function only trains.

  Args:
    model: network called as `model(features)`; its output is passed to
      `cross_entropy` together with the labels.
    train_dataloader: iterable of `(features, labels)` training batches.
    val_dataloader: iterable of `(features, labels)` validation batches.
    optimizer: optimizer already bound to the model parameters.
    max_epochs: number of passes over `train_dataloader`.
    device: device the model and the batches are moved to.
    writer: object with `add_text` / `add_scalars`, or None to disable logging.
  """
  model.to(device)
  model.train()

  step = 0
  if writer is not None:
    _log_val_prediction(model, val_dataloader, writer, 0, device)

  for epoch in range(max_epochs):
    for features, labels in train_dataloader:

      # The metrics are only used for logging, so skip them without a writer.
      if writer is not None and step % 500 == 0:
        # Measure in eval mode and without autograd, then resume training mode.
        model.eval()
        with torch.no_grad():
          train_acc = get_acc_loss_pred(model, train_dataloader, need_predict=False, device=device)
          val_acc = get_acc_loss_pred(model, val_dataloader, need_predict=False, device=device)
        model.train()

        print_info_in_writer(model, writer, train_acc, val_acc, step)

      features, labels = features.to(device), labels.to(device)
      optimizer.zero_grad()
      preds = model(features)
      # Targets equal to 0 do not contribute to the loss.
      loss = torch.nn.functional.cross_entropy(preds, labels, ignore_index=0)
      loss.backward()

      optimizer.step()
      step += 1

    if writer is not None:
      _log_val_prediction(model, val_dataloader, writer, epoch + 1, device)


def _log_val_prediction(model, val_dataloader, writer, global_step, device):
  """Decode the prediction for the first validation sample and log it as text.

  Uses the module-level `tokenizer` and `vocab_old_new` for decoding.
  """
  model.eval()
  with torch.no_grad():
    _, prediction = get_acc_loss_pred(model, val_dataloader, need_predict=True, device=device)
  model.train()

  decoded_prediction = decode_sentence(prediction, tokenizer, vocab_old_new)
  writer.add_text(tag=f'VAL/predictions: vocab_size = {model.num_classes}, latent_dim={model.latent_dim}, emb_dim={model.emb_dim}, hidden_dim={model.hidden_dim}, num_layers={model.num_layers}, drop_p={model.p}',
                  text_string=decoded_prediction, global_step=global_step)

      

In [None]:
# Hyper-parameter grid: one training run per entry, listed as
# (num_layers, hidden_dim, drop_p).
params = [
  dict(num_layers=layers, hidden_dim=hidden, drop_p=drop)
  for layers, hidden, drop in (
    (2, 500, 0.1),
    (4, 500, 0.1),
    (2, 200, 0.1),
    (2, 500, 0.3),
  )
]

In [None]:
! rm -rf './runs'

In [None]:
# Train one model per configuration; all runs log into the same writer.
writer = SummaryWriter()

try:
  for param in params:
    model = AutoEncoder(vocab_size=len(vocab_old_new), latent_dim=30, emb_dim=30, hidden_dim=param['hidden_dim'],
                 num_layers=param['num_layers'], p=param['drop_p'], zero_input=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    fit(model, train_dataloader, test_dataloader, optimizer, max_epochs=3, device=device, writer=writer)
finally:
  # Close the writer even if one of the runs fails part-way, so the logs of
  # the runs that did finish are not left in an unclosed event file.
  writer.close()



In [None]:
%load_ext tensorboard
%tensorboard --logdir ./runs

In [None]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
! rm temp.zip
! zip -r temp.zip /content/runs
! cp temp.zip "/content/gdrive/MyDrive/Study/8 sem/MachineLearning/HW1/tensorboard3.zip"