# **INSTALL LIBRARIES**

In [1]:
# !pip install transformers

# **LIBRARIES**

In [2]:
import json
import numpy as np
import torch
import torch.nn as nn
import os
# Dataset
from PIL import Image
from torchvision import transforms
from torchvision.io import read_video, read_image
from torch.utils.data import Dataset, DataLoader
# Model
from transformers import AutoModel
from transformers import AutoTokenizer, BertModel, BertTokenizer

# Training parameter
from torch.optim import Adam
# Training process
from tqdm import tqdm
# Metrics
from sklearn.metrics import classification_report

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

# **SETUP PARAMETERS**

In [4]:
FOLDER = '/data/ECF 2.0/'
save_checkpoint_dir = FOLDER + "checkpoints"
save_model_dir = FOLDER + "model"
save_report_dir = FOLDER + "report"
model_name = "BERT"
tokenizer_name = 'bert-base-cased'
conversation_encoder_name = 'bert-base-cased'
optimizer_name = "BERT"
loss_fn_name = "BCE_Loss"
metrics = {}
output_json = FOLDER + "predict"
file_train = {"subtask_1_text_file": FOLDER + "train/Subtask_1_train.json",
              "subtask_2_text_file": FOLDER + "train/Subtask_2_train.json",
              "video_dir": FOLDER + "train/video_with_audio",
              "max_conversation_length": 32,
              "max_emotion_cause_pairs_length": 128}
file_trial = {"subtask_1_text_file": FOLDER + "trial/Subtask_1_trial.json",
              "subtask_2_text_file": FOLDER + "trial/Subtask_2_trial.json",
              "video_dir": FOLDER + "trial/video_with_audio",
              "max_conversation_length": 32,
              "max_emotion_cause_pairs_length": 128}
subtask = "subtask_1"
format = "ECF 2.0"

# Define token to find subtensor
sta_token = 101
eof_token = 102
max_length = 512

# **UTIL FUNCTIONS**

In [5]:
def read_json_file(file_path):
  with open(file_path, 'r') as f:
    data = json.load(f)
  return data
def find_sub_list_indices(main_list, sub_list):
  start_index = None
  end_index = None
  sub_list = sub_list[0]
  end_sublist = sub_list.index(eof_token)
  sub_list = sub_list[1:end_sublist]
  start_index = main_list.index(sub_list[0])
  end_index = start_index + len(sub_list)
  while main_list[start_index:end_index] != sub_list:
    start_index = main_list.index(sub_list[0], start_index + 1)
    end_index = start_index + len(sub_list)

  return start_index, end_index

# **LOAD MODEL**

In [6]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
conversation_encoder = AutoModel.from_pretrained(conversation_encoder_name)

# **TRAIN**

In [7]:
# prompt: create trainer class
class Trainer(object):
  def __init__(self, model, optimizer, loss_fn, metrics=None, epochs=10, save_checkpoint_dir='./', save_model_dir='./', save_report_dir='./'):
    self.model = model
    self.optimizer = optimizer
    self.loss_fn = loss_fn
    self.metrics = metrics

    self.epochs = epochs
    self.save_checkpoint_dir = save_checkpoint_dir
    self.save_model_dir = save_model_dir
    self.save_report_dir = save_report_dir

  def train(self, train_loader, val_loader, epochs):
    best_val_loss = np.inf
    for epoch in range(epochs):
      train_loss, train_metrics = self._train_epoch(train_loader)
      val_loss, val_metrics = self._val_epoch(val_loader)
      if val_loss < best_val_loss:
        best_val_loss = val_loss
        self.save_checkpoint(epoch)
      print(f"Epoch {epoch+1}: Train loss {train_loss:.4f}, Train {train_metrics}, Valid loss {val_loss:.4f}, Valid {val_metrics}")

  def _train_epoch(self, dataset):
    self.model.train()
    train_loss = 0.0
    train_metrics = {}
    for batch in dataset:
      outputs = self.model(batch)
      loss = self.loss_fn(outputs, batch["target"])
      self.optimizer.zero_grad()
      loss.backward()
      self.optimizer.step()
      train_loss += loss.item()
      for metric, fn in self.metrics.items():
        train_metrics[metric] = fn(outputs, batch["target"])
    return train_loss / len(dataset), train_metrics

  def _val_epoch(self, dataset):
    self.model.eval()
    test_loss = 0.0
    test_metrics = {}
    for batch in dataset:
      outputs = self.model(batch)
      loss = self.loss_fn(outputs, batch["target"])
      test_loss += loss.item()
      for metric, fn in self.metrics.items():
        test_metrics[metric] = fn(outputs, batch["target"])
    return test_loss / len(dataset), test_metrics
  def save_checkpoint(self, epoch):
    checkpoint_path = os.path.join(self.save_checkpoint_dir, 'checkpoint_{}.pth'.format(epoch))
    torch.save({
        'model_state_dict': self.model.state_dict(),
        'optimizer_state_dict': self.optimizer.state_dict(),
        'epoch': epoch
    }, checkpoint_path)

# **PREDICTION**

In [8]:
class Predictor(object):
  def __init__(self, model, output_json, subtask=None, batch_size=32):
    self.model = model
    self.output_json = output_json
    self.subtask = subtask
    self.batch_size = batch_size
    self.text_to_number_mapping = {"neutral": 0, 'surprise': 1, 'anger': 2, "disgust": 3, "fear": 4, "joy": 5, "sadness": 6}
    self.number_to_text_mapping = {0: "neutral", 1: 'surprise', 2: 'anger', 3: "disgust", 4: "fear", 5: "joy", 6: "sadness"}
    self.predicted_dataset = None

  def emotion_cause_pairs(self, casual_text, predicted_emotion, casual = True):
    text = str(predicted_emotion[0] + 1) + "_" + str(self.number_to_text_mapping[predicted_emotion[1]])
    if casual == True:
      cause = str(predicted_emotion[2] + 1) + "_" + str(casual_text[predicted_emotion[3]: predicted_emotion[4]])
    else:
      cause = str(predicted_emotion[2] + 1)
    return [text, cause]

  def predict(self, dataset):
    dataloader = DataLoader(dataset=dataset, batch_size=self.batch_size, shuffle=False)
    self.model.eval()
    for batch in dataloader:
      outputs = self.model.predict(batch) # (batch_size, max_utterance, 5)
      if self.subtask == "subtask_1":
        dataset=self.predict_subtask_1(batch, dataset, outputs)
      elif self.subtask == "subtask_2":
        dataset=self.predict_subtask_2(batch, dataset, outputs)

    self.save_prediction(dataset)
    
  def predict_subtask_1(self, batch, dataset, outputs):
    for index in range(len(outputs)):
      predicted_emotion = outputs[index] # (max_utterance, 5)
      # [sentence id, emotion, causal sentence id, casual start index, casual end index]
      conversation_ID = batch['conversation_ID'][index]
      dataset[conversation_ID-1]['emotion-cause_pairs'] = []
      for j in range(len(dataset[conversation_ID-1]['conversation'])):
        dataset[conversation_ID-1]['conversation'][j]['emotion'] = "neutral"

      for j in range(len(predicted_emotion)):
        casual_text = dataset[conversation_ID-1]['conversation'][predicted_emotion[2]]
        dataset[conversation_ID-1]['emotion-cause_pairs'].append(self.emotion_cause_pairs(casual_text, predicted_emotion))
    return dataset
  def predict_subtask_2(self, batch, dataset, outputs):
    for index in range(len(outputs)):
      predicted_emotion = outputs[index] # (max_utterance, 5)
      # [sentence id, emotion, causal sentence id, casual start index, casual end index]
      conversation_ID = batch['conversation_ID'][index]
      dataset[conversation_ID-1]['emotion-cause_pairs'] = []
      for j in range(len(dataset[conversation_ID-1]['conversation'])):
        dataset[conversation_ID-1]['conversation'][j]['emotion'] = "neutral"
        dataset[conversation_ID-1]['conversation'][j]['video_name'] = "dia" + str(conversation_ID) + "utt" + str(j+1)

      for j in range(len(predicted_emotion)):
        casual_text = dataset[conversation_ID-1]['conversation'][predicted_emotion[2]]
        dataset[conversation_ID-1]['emotion-cause_pairs'].append(self.emotion_cause_pairs(casual_text, predicted_emotion, casual=False))
    return dataset
  def save_prediction(self, dataset):
    with open(self.output_json, 'w') as f:
      json.dump(dataset, f)

# **DATASET**

In [9]:
class Dataset(Dataset):
  def __init__(self, file, format="ECF 2.0", subtask=None, tokenizer=None, max_length=16):
    self.format = format
    self.subtask = subtask
    self.speaker_list = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "O", "P", "Q", "W", "R", "T", "U"]
    if format == 'ECF 2.0':
      self.tokenizer = tokenizer
      self.max_conversation_length = file['max_conversation_length']
      self.max_emotion_cause_pairs_length = file['max_emotion_cause_pairs_length']
      if self.subtask == "subtask_1":
        self.text_file = file['subtask_1_text_file']
        self.text_data = json.load(open(self.text_file))
        self.max_length = max_length
        self.text_to_number_mapping = {"neutral": 0, 'surprise': 1, 'anger': 2, "disgust": 3, "fear": 4, "joy": 5, "sadness": 6}
        # "Anger": 0, "Disgust": 1, "Fear": 2, "Joy": 3, "Sadness": 4, "Surprise": 5,
      elif self.subtask == "subtask_2":
        self.text_file = file['subtask_2_text_file']
        self.text_data = json.load(open(self.text_file))
        self.max_length = max_length
        self.text_to_number_mapping = {"Anger": 0, "Disgust": 1, "Fear": 2, "Joy": 3, "Sadness": 4, "Surprise": 5, "neutral": 6}
        self.video_dir = file['video_dir']
  def get_data(self):
    return self.text_data

  def __len__(self):
    return len(self.text_data)

  def emotion_EDA(self):
    self.emotion_counts = {"neutral": 0, 'surprise': 0, 'anger': 0, "disgust": 0, "fear": 0, "joy": 0, "sadness": 0}

    for index in range(len(self.text_data)):
      conversation = self.text_data[index]['conversation']
      for i in range(len(conversation)):
        emotion = conversation[i]['emotion']
        self.emotion_counts[emotion] += 1
    return self.emotion_counts

  def __getitem__(self, index):
    item = self.text_data[index]
    conversation_ID = item['conversation_ID']
    conversation = item['conversation']
    text_conversation = {}
    speaker_conversation = {}
    emotion_conversation = {}
    emotion_conversation = torch.zeros((self.max_conversation_length, len(self.text_to_number_mapping)))
    emotion_cause_pairs_conversation = torch.zeros((self.max_emotion_cause_pairs_length, 5))
    text_conversation['input_ids'] = torch.zeros((self.max_conversation_length, self.max_length))
    text_conversation['token_type_ids'] = torch.zeros((self.max_conversation_length, self.max_length))
    text_conversation['attention_mask'] = torch.zeros((self.max_conversation_length, self.max_length))
    speaker_conversation['input_ids'] = torch.zeros((self.max_conversation_length, self.max_length))
    speaker_conversation['token_type_ids'] = torch.zeros((self.max_conversation_length, self.max_length))
    speaker_conversation['attention_mask'] = torch.zeros((self.max_conversation_length, self.max_length))

    list = []
    start_conversation_index = min(self.max_conversation_length - len(conversation), 0)
    for i in range(len(conversation)):
      truth_index = i+start_conversation_index
      utterance_ID = conversation[i]['utterance_ID']
      text = self.tokenizer(conversation[i]['text'], padding="max_length", max_length=self.max_length, truncation=True, return_tensors="pt")
      text_conversation['input_ids'][truth_index] = text['input_ids']
      text_conversation['token_type_ids'][truth_index] = text['token_type_ids']
      text_conversation['attention_mask'][truth_index] = text['attention_mask']

      if conversation[i]['speaker'] not in list:
        list.append(conversation[i]['speaker'])

      id = list.index(conversation[i]['speaker'])
      speaker = self.tokenizer(self.speaker_list[id], padding="max_length", max_length=self.max_length, truncation=True, return_tensors="pt")

      speaker_conversation['input_ids'][truth_index] = speaker['input_ids']
      speaker_conversation['token_type_ids'][truth_index] = speaker['token_type_ids']
      speaker_conversation['attention_mask'][truth_index] = speaker['attention_mask']
      if "emotion" in conversation[i]:
        emotion = conversation[i]['emotion']
        emotion = self.text_to_number_mapping[emotion]
        emotion_binary_tensor = torch.zeros(len(self.text_to_number_mapping))
        emotion_binary_tensor[emotion] = 1
        emotion_conversation[truth_index] = emotion_binary_tensor
      if self.subtask == "subtask_2":
        video_name = conversation[i]['video_name']
        video_path = os.path.join(self.video_dir, video_name)
        frames, info = read_video(video_path)

        if self.transform:
            frames = [self.transform(frame) for frame in frames]

    if "emotion-cause_pairs" in item:
      for i in range(len(item['emotion-cause_pairs'])):
        emotion_pairs = item['emotion-cause_pairs'][i]
        emotion_declare = item['emotion-cause_pairs'][i][0].split('_')
        emotion_position = int(emotion_declare[0])

        emotion_cause_pairs_declare = item['emotion-cause_pairs'][i][1].split('_')
        emotion_evidence_position, emotion_evidence = int(emotion_cause_pairs_declare[0]) - 1, ' '.join(emotion_cause_pairs_declare[1:])
        emotion_evidence = self.tokenizer(emotion_evidence, padding="max_length", truncation=True, return_tensors="pt")
        start_index, end_index = find_sub_list_indices(text_conversation['input_ids'][emotion_evidence_position + start_conversation_index].tolist(), emotion_evidence['input_ids'].tolist())
        emotion_cause_pairs_conversation[i] = torch.Tensor([emotion_position-1, emotion, emotion_evidence_position, start_index, end_index])

    return {"text_conversation": text_conversation,
            "speaker_conversation": speaker_conversation,
            "labels": emotion_conversation,
            "emotion_cause_pairs_conversation": emotion_cause_pairs_conversation,
            "conversation_ID": conversation_ID,
            }

# **MODEL**

In [10]:
class VH(nn.Module):
  def __init__(self, bert_encoder, label_dim):
    super(VH, self).__init__()
    self.label_dim = label_dim
    self.projection= nn.Linear(512, 1)
    self.bert = bert_encoder
    self.fc = nn.Linear(768, self.label_dim)
    
  def forward(self, text_conversation, speaker_conversation):
    batch_size = text_conversation.size(0)
    max_utterance = text_conversation.size(1)
    print(text_conversation.shape)
    text_conversation = text_conversation.reshape(batch_size*max_utterance, -1)
    x = self.bert(input_ids=text_conversation.long())
    print(x.keys())
    
    
    # utter = torch.zeros(size=(1,0))
    # for index in range(max_utterance):
    #   temp = self.bert(input_ids=text_conversation[:,index,:].long())
    #   if utter[0].nelement():
    #     for key in temp.keys():
    #       utter[key]=torch.cat((utter[key],temp[key]),0)
    #   else:
    #     utter=temp
      
    

# **TEST**

In [11]:
train_dataset = Dataset(file=file_train, format="ECF 2.0", subtask="subtask_1", tokenizer=tokenizer, max_length=max_length)
train_dataset.emotion_EDA()
train_loader = DataLoader(train_dataset, batch_size=2)

In [12]:
model = VH(conversation_encoder, 6).to('cuda:0')
model(next(iter(train_loader))['text_conversation']['input_ids'].to('cuda:0'), next(iter(train_loader))['speaker_conversation']['input_ids'].to('cuda:0'))

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


RuntimeError: CUDA out of memory. Tried to allocate 24.00 MiB (GPU 0; 10.76 GiB total capacity; 8.93 GiB already allocated; 2.56 MiB free; 9.31 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
next(iter(train_loader))['text_conversation']

{'input_ids': tensor([[[   0.,    0.,    0.,  ...,    0.,    0.,    0.],
          [   0.,    0.,    0.,  ...,    0.,    0.,    0.],
          [   0.,    0.,    0.,  ...,    0.,    0.,    0.],
          ...,
          [ 101., 5091., 1125.,  ...,    0.,    0.,    0.],
          [ 101., 1302.,  119.,  ...,    0.,    0.,    0.],
          [ 101., 1398., 1104.,  ...,    0.,    0.,    0.]],
 
         [[   0.,    0.,    0.,  ...,    0.,    0.,    0.],
          [   0.,    0.,    0.,  ...,    0.,    0.,    0.],
          [   0.,    0.,    0.,  ...,    0.,    0.,    0.],
          ...,
          [ 101.,  146., 1202.,  ...,    0.,    0.,    0.],
          [ 101., 1262.,  146.,  ...,    0.,    0.,    0.],
          [ 101., 4858.,  136.,  ...,    0.,    0.,    0.]]]),
 'token_type_ids': tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0

In [None]:
key = next(iter(train_loader)).keys()
for i in key:
    print(i)
    print(next(iter(train_loader))[i])
    print(next(iter(train_loader))[i].shape)

text_conversation
{'input_ids': tensor([[[   0.,    0.,    0.,  ...,    0.,    0.,    0.],
         [   0.,    0.,    0.,  ...,    0.,    0.,    0.],
         [   0.,    0.,    0.,  ...,    0.,    0.,    0.],
         ...,
         [ 101., 5091., 1125.,  ...,    0.,    0.,    0.],
         [ 101., 1302.,  119.,  ...,    0.,    0.,    0.],
         [ 101., 1398., 1104.,  ...,    0.,    0.,    0.]],

        [[   0.,    0.,    0.,  ...,    0.,    0.,    0.],
         [   0.,    0.,    0.,  ...,    0.,    0.,    0.],
         [   0.,    0.,    0.,  ...,    0.,    0.,    0.],
         ...,
         [ 101.,  146., 1202.,  ...,    0.,    0.,    0.],
         [ 101., 1262.,  146.,  ...,    0.,    0.,    0.],
         [ 101., 4858.,  136.,  ...,    0.,    0.,    0.]]]), 'token_type_ids': tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0.,

AttributeError: 'dict' object has no attribute 'shape'