### Setup model first

### Import

In [None]:
%%capture
!pip install transformers tokenizers
import torch
from torch import nn
from typing import List
import torch.nn.functional as F
from transformers import DistilBertTokenizer, AutoTokenizer, AutoModelWithLMHead, DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import logging
import os
from functools import lru_cache
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
#import pytorch_lightning as pl
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from argparse import Namespace
from sklearn.metrics import classification_report
torch.__version__


### Tokenizer setup

In [None]:
!mkdir -p tokenizer
## load pretrained tokenizer information
def setup_tokenizer():
  """Load the pretrained 'distilroberta-base' tokenizer and save its files under ``tokenizer``."""
  AutoTokenizer.from_pretrained('distilroberta-base').save_pretrained("tokenizer")

setup_tokenizer()
class TokenizersCollateFn:
    """Collate callable turning a batch of ``(text, label)`` items into tensors.

    The tokenizer is built from ``tokenizer/vocab.json`` and
    ``tokenizer/merges.txt`` and is configured to truncate and pad every
    sequence to ``max_tokens``.
    """

    def __init__(self, max_tokens=512):
        # RoBERTa uses a byte-level BPE tokenizer similar to GPT
        bpe = ByteLevelBPETokenizer("tokenizer/vocab.json", "tokenizer/merges.txt")

        sep_pair = ("</s>", bpe.token_to_id("</s>"))
        cls_pair = ("<s>", bpe.token_to_id("<s>"))
        bpe._tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)

        bpe.enable_truncation(max_tokens)
        bpe.enable_padding(length=max_tokens, pad_id=bpe.token_to_id("<pad>"))
        self.tokenizer = bpe

    def __call__(self, batch):
        """Return ``((token_ids, attention_masks), labels)`` for ``batch``.

        Each item of ``batch`` is indexed as ``item[0]`` (text) and ``item[1]`` (label).
        """
        texts = []
        targets = []
        for item in batch:
            texts.append(item[0])
            targets.append(item[1])

        encodings = self.tokenizer.encode_batch(texts)
        token_ids = torch.tensor([e.ids for e in encodings])
        masks = torch.tensor([e.attention_mask for e in encodings])

        return (token_ids, masks), torch.tensor(targets)
## emotion labels
# Emotion names in label-id order: a name's position in the list is its integer id.
emotions = ["sadness", "joy", "love", "anger", "fear", "surprise"]

# name -> integer id, derived from the ordering above
label2int = {name: idx for idx, name in enumerate(emotions)}



### Model define
#### Train from scratch is fine, takes < 20 minutes

In [None]:
# from https://github.com/digantamisra98/Mish/blob/b5f006660ac0b4c46e2c6958ad0301d7f9c59651/Mish/Torch/mish.py
@torch.jit.script
def mish(input):
    """Mish activation: ``input * tanh(softplus(input))``."""
    gate = torch.tanh(F.softplus(input))
    return input * gate


class Mish(nn.Module):
    """Module wrapper around :func:`mish` so it can be placed inside ``nn.Sequential``."""

    def forward(self, input):
        return mish(input)


class EmoModel(nn.Module):
    """Encoder (``base_model``) followed by a two-layer MLP head.

    The head takes ``base_model_output_size`` features and produces
    ``n_classes`` outputs; dropout with probability ``dropout`` precedes each
    linear layer.
    """

    def __init__(self, base_model, n_classes, base_model_output_size=768, dropout=0.05):
        super().__init__()
        self.base_model = base_model

        width = base_model_output_size
        head_layers = [
            nn.Dropout(dropout),
            nn.Linear(width, width),
            Mish(),
            nn.Dropout(dropout),
            nn.Linear(width, n_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

        # Re-initialise the linear layers: weights ~ N(0, 0.02), biases zero.
        linear_layers = [m for m in self.classifier if isinstance(m, nn.Linear)]
        for linear in linear_layers:
            linear.weight.data.normal_(mean=0.0, std=0.02)
            if linear.bias is None:
                continue
            linear.bias.data.zero_()

    def forward(self, input_, *args):
        """Run the encoder on ``input_ = (X, attention_mask)`` and classify sequence position 0."""
        token_ids, mask = input_
        encoder_out = self.base_model(token_ids, attention_mask=mask)

        # maybe do some pooling / RNNs here instead

        # use the <s> representation (position 0 of the first encoder output)
        first_token = encoder_out[0][:, 0, :]
        return self.classifier(first_token)

In [None]:
def get_model():
  """Build an EmoModel with two outputs on top of the pretrained distilroberta-base encoder."""
  pretrained = AutoModelWithLMHead.from_pretrained("distilroberta-base")
  return EmoModel(pretrained.base_model, 2)
get_model().classifier

### Dataset 

In [None]:
class EmoDataset(Dataset):
    """Dataset view over an indexable collection of ``(text, [V, A])`` records."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __len__(self):
        return len(self.content)

    def __getitem__(self, idx):
        # each record is (string, [V, A]); only the first two fields are returned
        record = self.content[idx]
        return record[0], record[1]

### load files

In [None]:
!wget -q https://www.dropbox.com/s/e9bxb4qx19be6nn/face_with_va.csv

In [None]:
!ls

In [None]:
face_va = pd.read_csv("face_with_va.csv")
face_va.head()

#### file dict that fixes the data leak, i.e., the emoji is excluded from the text itself
#### checkout file_dict_fix_structure in drive?

In [None]:
def read_pkl(fname):
  """Load and return the object pickled in the file ``fname``.

  NOTE(review): ``pickle.load`` can execute arbitrary code while loading;
  only use this on files from a trusted source.
  """
  import pickle
  # the ``with`` block closes the file on exit, so no explicit close() is needed
  with open(fname, "rb") as f:
    return pickle.load(f)

In [None]:
!wget -q https://www.dropbox.com/s/huubtcxrs988ufn/file_dict_66_fix_data_leak.pkl

In [None]:
file_dict = read_pkl("file_dict_66_fix_data_leak.pkl")

In [None]:
file_dict.keys()

In [None]:
len(file_dict.keys())

In [None]:
file_dict['tiredface'][0]

In [None]:
"check file structure of new file_dict"
# Every key of file_dict must be a string and every post stored under it must be a string.
for k, v in file_dict.items():
  assert isinstance(k, str)
  for vv in v:
    # report the offending element itself, not the whole list it came from
    assert isinstance(vv, str), "%s %s %s" % (k, type(vv), vv)

In [None]:
"key -> [V,A]"
FVA = {k:[v,a] for k, v, a in zip(face_va['face'], face_va['V_norm'], face_va['A_norm'])}

In [None]:
face_va['face']

In [None]:
"check FVA, passed"
for k, v in FVA.items():
  assert isinstance(k, str)
  assert isinstance(v[0], float)
  assert isinstance(v[1], float)

In [None]:
FVA['star-struck']
#file_dict['starstruck'][0]

In [None]:
"stupid as always"

"create DS"
"(text, [V,A])"
# file_dict keys whose spelling differs from the corresponding FVA key
_FVA_KEY_ALIASES = {
  'relieved': 'relievedface',
  'starstruck': 'star-struck',
  'perserveringface': 'perseveringface',
  'smilingfacewithsunglass': 'smilingfacewithsunglasses',
}

# DS is a flat list of (text, [V, A]) pairs, one per post
DS = []

for key, posts in file_dict.items():
  fva_key = _FVA_KEY_ALIASES.get(key, key)
  # the FVA lookup happens once per post, inside the generator
  DS.extend((post, FVA[fva_key]) for post in posts)



In [None]:
DS[:2]

In [None]:
for ele in DS:
  assert isinstance(ele[0], str), type(ele[0])
  for ee in ele[1]:
    assert isinstance(ee, float)

In [None]:
"check"
#ds = EmoDataset(plurk_va)
ds = EmoDataset(DS)
ds[0] # (string, [V,A])

### Setup for training

In [None]:
"garbage collect!"
def collect():
  """Run Python's garbage collector, then release PyTorch's cached CUDA memory.

  Prints the value returned by ``gc.collect()``.
  """
  import gc
  # Collect first: tensors that only the GC can free must be gone before
  # empty_cache() can hand their memory back.
  print(gc.collect())
  torch.cuda.empty_cache()
collect()

In [None]:
def get_dataloader(content, bs, shuffle):
  """Wrap ``content`` in an EmoDataset and return a DataLoader over it.

  ``bs`` is the batch size, ``shuffle`` is forwarded to the DataLoader, and
  batches are collated by a fresh TokenizersCollateFn instance.
  """
  dataset = EmoDataset(content)
  collate = TokenizersCollateFn()
  return DataLoader(dataset, batch_size=bs, shuffle=shuffle, collate_fn=collate)

In [None]:
import random
rdx = random.uniform

In [None]:
rdx(0,1)

In [None]:
import random

# A fixed seed keeps the train/val/test membership identical across notebook
# restarts. Without it, weights saved in one session could later be evaluated
# on examples they were trained on.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

plurk_train, plurk_val, plurk_test = [], [], []

# One uniform draw per element: > 0.2 -> train, (0.1, 0.2] -> val, otherwise test.
for ele in DS:
  sp = split_rng.uniform(0, 1)
  if sp > 0.2:
    plurk_train.append(ele)
  elif sp > 0.1:
    plurk_val.append(ele)
  else:
    plurk_test.append(ele)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
# works
bs = 16
trainDL = get_dataloader(plurk_train, bs, True)
valDL = get_dataloader(plurk_val, bs, True)

In [None]:
tmp = next(iter(trainDL))
tmp[0][0].shape, tmp[0][1].shape, tmp[1].shape

### Training

In [None]:
model = get_model()
model.classifier

In [None]:
"move model to gpu first"
model.base_model.to(device)
model.base_model.device, model.classifier.to(device)

In [None]:
"let's just od one first"
epochs = 1

In [None]:
total_length = len(trainDL)
total_length

## Gradually unfreeze
#### freeze all; base lr = 1e-4
### classifiers     lr
#### unfreeze -1   1e-4
#### unfreeze -2   1e-4/2
#### unfreeze -3   1e-4/2
#### unfreeze -4   1e-4/2
#### unfreeze -5   1e-4/2
#### unfreeze all   1e-5


### Given the classifier structure, we did the modifications to our freeze policy
### freeze the base
### unfreeze dropout and mish
### only gradual unfreeze on the Linear heads


In [None]:
"freeze all"
for layer in list(model.parameters()):
  layer.requires_grad = False

In [None]:
model.classifier

In [None]:
for layer in list(model.classifier.parameters())[-2:]:
  layer.requires_grad = True

In [None]:
"unfreeze to -2 -> -4 -> all"
for layer in list(model.classifier.parameters()):
  print(layer.requires_grad)

In [None]:
"we're doing regression!"
criterion = nn.MSELoss()
# optimizer
" train with larger weights to make top layers better?"
lr = 1e-4
optimizer = AdamW(model.parameters(), lr=lr)
optimizer.zero_grad()

In [None]:
train_loss, val_loss = [], []

In [None]:
collect()

#### freeze until -1

In [None]:
epochs

In [None]:
"training loop"
def trainit(epochs):
  """Train the global ``model`` on ``trainDL`` for ``epochs`` epochs.

  Gradients are accumulated over 4 mini-batches before each optimizer step.
  On every 4th batch, a validation loss is computed on one batch taken from
  ``valDL`` and appended to the global ``val_loss``, and the training loss of
  that batch is appended to the global ``train_loss``. A progress row is
  printed every 200 batches.

  Relies on the globals ``model``, ``trainDL``, ``valDL``, ``criterion``,
  ``optimizer``, ``device``, ``total_length``, ``train_loss`` and ``val_loss``.

  Returns the loss tensor of the last training batch.

  NOTE(review): ``loss`` is only bound inside the loop, so the final
  ``return loss`` raises UnboundLocalError when no batch is processed.
  NOTE(review): when the number of batches is not a multiple of 4, the
  gradients of the trailing batches are accumulated but never applied here.
  """
  print("epoch batches train_loss val_loss")
  "update every 4 mini-batches -> 16*4 = 61"
  for epoch in range(epochs):
    for i, batch_ in enumerate(trainDL):
      # batch layout: (x, attn), tensor([V, A])
      (x, attn), y = batch_
      "# get val loss first"
      if i % 4 == 3:
        with torch.no_grad():
          model.eval()
          # a fresh iterator is created each time, so this takes the first batch of a new pass over valDL
          (_x, _attn), _y = next(iter(valDL))
          _o = model.forward((_x.to(device), _attn.to(device)))
          loss_val = criterion(_o, _y.to(device))
          val_loss.append(loss_val.item())

      "# then get train loss"
      # switch back to training mode (the validation branch above sets eval mode)
      model.train()
      output = model.forward((x.to(device), attn.to(device)))
      loss = criterion(output, y.to(device))
      # gradients add up across calls until zero_grad() below
      # NOTE(review): the loss is not divided by the number of accumulated batches
      loss.backward()

      # apply the accumulated gradients once every 4 batches
      if i % 4 == 3:
        optimizer.step()
        train_loss.append(loss.item())
        # reset the accumulated gradients for the next group of 4 batches
        optimizer.zero_grad()

      # i % 200 == 199 implies i % 4 == 3, so loss_val was set in this iteration
      if i % 200 == 199:
        print("%d     %.3f     %.4f     %.4f" % (epoch, float((i+1)/total_length), loss.item(), loss_val.item()))

  return loss

In [None]:
"unfreeze until -2"
loss = trainit(epochs)

In [None]:
"training loss"
loss.item()

In [None]:

collect()

In [None]:
from matplotlib import pyplot as plt
def plotit(inp_L, legends=None, y_lim=0.1, figsize=(12,8), title=None):
  """Plot every sequence in ``inp_L`` as a line against its index on one figure.

  Args:
    inp_L: iterable of sequences; each one becomes one line.
    legends: list of legend labels; anything that is not a list is ignored.
    y_lim: upper limit of the y axis (the lower limit is fixed at 0).
    figsize: figure size passed to ``plt.figure``.
    title: optional figure title.
  """
  plt.figure(figsize=figsize)
  for inp in inp_L:
    plt.plot(range(len(inp)), inp)

  # isinstance() is already False for None, so no separate None check is needed
  if isinstance(legends, list):
    plt.legend(legends)
  plt.ylim([0,y_lim])
  if title is not None:
    plt.title(title)
  plt.show()

#### check the training result before moving on

In [None]:
plotit([train_loss, val_loss], ["train_loss", "val_loss"], y_lim=0.5, title="freezed until -2")

In [None]:
len(train_loss), len(val_loss)

#### save the frozen-stage weights in case we need them later
#### (stored in my Drive so that the file is not lost if the session disconnects)

### lower the learning rate!

#### what we need was ...
#### lr
#### optimizer
#### trainit 
#### plotit
#### save weights with filename: arch_1_unfreeze_to_$idx.pth

In [None]:
# Unfreeze every parameter of the classifier head.
# The loop variable must be the one assigned in the body; otherwise only a
# stale, previously bound parameter is touched.
for layer in model.classifier.parameters():
  layer.requires_grad = True

# Show the resulting flag of each classifier parameter.
for layer in model.classifier.parameters():
  print(layer.requires_grad)

In [None]:
lr

In [None]:
print(epochs) # mostly 1

lr = 1e-4/2

optimizer = AdamW(model.parameters(), lr=lr)
optimizer.zero_grad()

loss = trainit(epochs)
loss.item()
collect()
plotit([train_loss, val_loss], ["train_loss", "val_loss"], y_lim=0.5, title="freezed until %d" % (-5))


In [None]:
# Path is not imported anywhere else in the notebook, so import it here.
from pathlib import Path

# Directory that receives the saved weight files; it must already exist.
basedir = Path("your_directory_here")

assert basedir.is_dir()

In [None]:
def save_weights(filename: str):
  assert filename.endswith(".pt")
  assert basedir.is_dir()
  assert not Path(basedir/filename).is_file()
  torch.save(model.state_dict(), basedir/filename)
  !ls -l drive/MyDrive/your_directory_here

In [None]:
save_weights("arch_1_freeze_until_ng4_.pt")

### Last stage: unfreeze all!

In [None]:
lr

In [None]:
"unfreeze all!"
for layer in list(model.parameters()):
  layer.requires_grad = True

for layer in list(model.parameters()):
  assert layer.requires_grad == True

#### using a lower learning rate seems safer,
#### so as not to ruin the pretrained weights

In [None]:
lr

In [None]:
new_lr = 1e-5
new_lr

In [None]:
"optimizer!"
# optimizer
optimizer = AdamW(model.parameters(), lr=new_lr)
optimizer.zero_grad()


In [None]:
collect()

#### unfreeze + lowered learning rate

In [None]:
loss = trainit(epochs)

In [None]:
loss.item()

#### freeze + unfreeze

In [None]:
plotit([train_loss, val_loss], ["train_loss", "val_loss"], y_lim=0.5, title="after unfreeze")

In [None]:
len(train_loss), len(val_loss)

In [None]:
save_weights("arch1_unfreeze_all.pt")

In [None]:
collect()

In [None]:
loss.item()

### Validation & Visualize

In [None]:
testDL = get_dataloader(plurk_test, bs, True)

In [None]:
test_loss = []
test_val_pred = []

In [None]:
# Evaluate on the test set (recording the per-batch loss) and on the val set,
# collecting [prediction, label] pairs of both into test_val_pred.
with torch.no_grad():
  model.eval()
  # test set
  for batch_ in testDL:
    # (x, attn), tensor([V, A])
    (x, attn), y = batch_

    # output: [V, A]
    output = model.forward((x.to(device), attn.to(device)))
    loss = criterion(output, y.to(device))

    # test loss
    test_loss.append(loss.item())

    # and collect the prediction result
    # [prediction, label] for future indexing
    test_val_pred.append([output, y])

  # on val set
  for batch_ in valDL:
    # (x, attn), tensor([V, A])
    (x, attn), y = batch_

    # The model must be run on this batch too; otherwise the output of the
    # last test batch would be paired with every val label.
    output = model.forward((x.to(device), attn.to(device)))

    test_val_pred.append([output, y])



In [None]:
# one can change the y_lim argument here, or just use matplotlib to do what you want will work, too
plotit([test_loss], ["test"], y_lim=0.02, figsize=(12,8))

In [None]:
plotit([train_loss, val_loss, test_loss], ["train_loss", "val_loss", "test_loss"])

## Scatter plots

In [None]:
def sct(inp_L, legends=None, figsize=(15,10)):
  """Draw one annotated scatter series per entry of ``inp_L`` on a single figure.

  Each entry is indexed as ``entry[0]`` (point labels), ``entry[1]`` (x values)
  and ``entry[2]`` (y values). ``legends`` is used only when it is a list.
  """
  plt.figure(figsize=figsize)

  for series in inp_L:
    point_labels, xs, ys = series[0], series[1], series[2]
    plt.scatter(xs, ys)
    # write each point's label next to it
    for text, point in zip(point_labels, zip(xs, ys)):
      plt.annotate(text, xy=point)

  if isinstance(legends, list):
    plt.legend(legends)

  plt.show()



### change test_val_pred into predictions and labels
### predictions: list of predicted tensor([V,A])
### labels: list of label tensor([V,A]), useful in indexing (so that we know which of the emotions are more easy to predict)


In [None]:
len(test_val_pred), len(valDL), len(testDL)

In [None]:
"process test_val_pred"
"list of [prediction, label]"
"prediction: [tensor (of bs=16)]"
"tensor (of bs=16): tensor([V,A]"
type(test_val_pred), type(test_val_pred[0]), type(test_val_pred[0][0])

In [None]:
# [predict, label]
test_val_pred[0]

In [None]:
# [V,A]
test_val_pred[0][0]

In [None]:
"list of [V,A]"
# Split the [prediction, label] pairs into two parallel lists of batches.
_predictions = [pair[0] for pair in test_val_pred]
_labels = [pair[1] for pair in test_val_pred]



In [None]:
len(_predictions), len(_labels), len(test_val_pred)

In [None]:
_predictions[0]

In [None]:
"dict of v_list, a_list"
predictions = {"V_norm": [], "A_norm": []}

# _p: batch of tensor [V,A]
for _p in _predictions:
  # each tensor in batch
  for _t in _p:
    _v, _a = _t[0], _t[1]
    predictions["V_norm"].append(float(_v))
    predictions["A_norm"].append(float(_a))


In [None]:
len(test_val_pred),len(test_val_pred[0]),len(test_val_pred[0][0])

In [None]:
len(_predictions)

In [None]:
329*16

In [None]:
len(testDL), len(valDL)

In [None]:
# since we resample valDL, the length won't simply = len(val_pred_test)*bs
len(predictions["V_norm"])

In [None]:
"Also"
"dict of v_list, a_list"
labels = {"V_norm": [], "A_norm": []}

# _p: batch of tensor [V,A]
for _p in _labels:
  # each tensor in batch
  for _t in _p:
    _v, _a = _t[0], _t[1]
    labels["V_norm"].append(float(_v))
    labels["A_norm"].append(float(_a))

len(labels["V_norm"])

In [None]:
collect()

In [None]:
face_va.head()

In [None]:
"USE labels"
"get index and we're done"
"refer to label -> face_va"


In [None]:
list(range(len(labels["A_norm"])))[:10]

In [None]:
face_va.head()

## indices done!

In [None]:
"answers!"
V = list(face_va["V_norm"])
A = list(face_va["A_norm"])

In [None]:
"use labels to do indexing"
tt = labels["V_norm"][0]

In [None]:
tt

In [None]:
"V.index(tt) # this will throw and error"
"-> use string to do the index"
"map the answers"
V_s = ["%.6f" % (ele) for ele in V]
A_s = ["%.6f" % (ele) for ele in A]

"map the labels as well"
"(we only need one TBH, the other is used to check the answer)"
labels_V_s = ["%.6f"%(ele) for ele in labels["V_norm"]]
labels_A_s = ["%.6f"%(ele) for ele in labels["A_norm"]]

len(labels_V_s) == len(labels_A_s)

In [None]:
"we use V as index, since 0.920755 appeared twice in A"
idxs = [V_s.index(ele) for ele in labels_V_s]

In [None]:
plt.figure(figsize=(12,8))
x_arr = V
y_arr = A
plt.scatter(x_arr,y_arr)

for lb, x, y in zip(list(range(len(x_arr))), x_arr, y_arr):
  plt.annotate(lb, xy=(x,y))

plt.legend(["labels"])
plt.show()

In [None]:
face_va.loc[[1]]

In [None]:
face_va[["face", "V_norm", "A_norm"]].loc[[3]]

In [None]:
plt.figure(figsize=(12,8))
x_arr = predictions["V_norm"][:60]
y_arr = predictions["A_norm"][:60]
plt.scatter(x_arr,y_arr)

for lb, x, y in zip(idxs, x_arr, y_arr):
  plt.annotate(lb, xy=(x,y))

plt.legend(["predictions (part of)"])
plt.show()

In [None]:
face_va[["face","V_norm","A_norm"]].loc[[18, 15, 42]]

## Combined!

In [None]:
plt.figure(figsize=(15,10))

x_arr = predictions["V_norm"][:60]
y_arr = predictions["A_norm"][:60]
plt.scatter(x_arr,y_arr)

for lb, x, y in zip(idxs, x_arr, y_arr):
  plt.annotate(lb, xy=(x,y))


x_arr = V
y_arr = A
plt.scatter(x_arr,y_arr)

for lb, x, y in zip(list(range(len(x_arr))), x_arr, y_arr):
  plt.annotate(lb, xy=(x,y))

plt.legend(["predictions (part of)", "labels"])
plt.title("predictions vs labels")
plt.show()

In [None]:
plt.figure(figsize=(12,8))
x_arr = V
y_arr = A
plt.scatter(x_arr,y_arr)

for lb, x, y in zip(list(range(len(x_arr))), x_arr, y_arr):
  plt.annotate(lb, xy=(x,y))

plt.legend(["labels"])
plt.show()

In [None]:
"ALL"
plt.figure(figsize=(21,14))

x_arr = predictions["V_norm"]
y_arr = predictions["A_norm"]
plt.scatter(x_arr,y_arr)

for lb, x, y in zip(idxs, x_arr, y_arr):
  plt.annotate(lb, xy=(x,y))


x_arr = V
y_arr = A
plt.scatter(x_arr,y_arr)

for lb, x, y in zip(list(range(len(x_arr))), x_arr, y_arr):
  plt.annotate(lb, xy=(x,y))

plt.legend(["predictions", "labels"])
plt.title("ALL predictions vs labels")
plt.show()

In [None]:
face_va[["face", "V_norm", "A_norm"]].loc[[27, 23, 30]]

In [None]:
plt.figure(figsize=(21,14))
x_arr = V
y_arr = A
plt.scatter(x_arr,y_arr)

for lb, x, y in zip(list(range(len(x_arr))), x_arr, y_arr):
  plt.annotate(lb, xy=(x,y))

plt.legend(["labels"])
plt.show()

## Notice that we don't have all 74 emotions in our dataset,
### so don't plot the answers that are out of scope

In [None]:
len(FVA.keys())

In [None]:
len(file_dict.keys())

In [None]:
TO_PRINT = list(file_dict.keys())
face_va[["face"]]#.loc[TO_PRINT]

In [None]:
TO_PRINT

In [None]:
refff = list(face_va["face"])
refff[:3]

#### If a key error is raised here, just keep running the next cell; you are still fine

In [None]:
# file_dict keys whose spelling differs from the names in face_va["face"].
# Kept in line with the aliases used when DS was built, including the
# sunglasses key, which would otherwise make refff.index() fail.
FACE_ALIASES = {
  "perserveringface": "perseveringface",
  "tired": "tiredface",
  "relieved": "relievedface",
  "starstruck": "star-struck",
  "smilingfacewithsunglass": "smilingfacewithsunglasses",
}

# Row index in face_va for every face that has posts in file_dict.
face_to_print = []
for ele in TO_PRINT:
  face_to_print.append(refff.index(FACE_ALIASES.get(ele, ele)))
len(face_to_print)

In [None]:
plt.figure(figsize=(12,8))
x_arr = [V[e] for e in face_to_print]
y_arr = [A[e] for e in face_to_print]

plt.scatter(x_arr,y_arr)

for lb, x, y in zip(face_to_print, x_arr, y_arr):
  plt.annotate(lb, xy=(x,y))

plt.legend(["labels"])
plt.title("Face that appeared in our DS")
plt.show()

In [None]:
"face that appeared + part of prediction"
"ALL"
plt.figure(figsize=(15,10))

x_arr = predictions["V_norm"][:60]
y_arr = predictions["A_norm"][:60]
plt.scatter(x_arr,y_arr)

for lb, x, y in zip(idxs, x_arr, y_arr):
  plt.annotate(lb, xy=(x,y))

x_arr = [V[e] for e in face_to_print]
y_arr = [A[e] for e in face_to_print]

plt.scatter(x_arr,y_arr)

for lb, x, y in zip(face_to_print, x_arr, y_arr):
  plt.annotate(lb, xy=(x,y))

plt.legend(["part of predictions", "labels"])
plt.title("Face that appeared in our DS")
plt.show()


In [None]:
"查表: 查答案的座標"
face_va[["face", "V_norm", "A_norm"]].loc[[36, 45, 57, 0]]