<a href="https://colab.research.google.com/github/JeehwanLim/202002_NLP_FIN/blob/main/%EA%B8%B0%EB%A7%90%EA%B3%BC%EC%A0%9C_%EC%98%81%EC%96%B42.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Preparation**

- Edit > Notebook settings > Hardward accelerators > GPU > SAVE
- Download the Friends dataset in EmotionLines website:
http://doraemon.iis.sinica.edu.tw/emotionlines/download.html
- Download the unlabeled json file.

##### **Settings**

In [1]:
!pip install transformers --quiet # package installer for python

[K     |████████████████████████████████| 1.5MB 5.6MB/s 
[K     |████████████████████████████████| 890kB 20.9MB/s 
[K     |████████████████████████████████| 2.9MB 35.2MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [2]:
!git clone https://github.com/JeehwanLim/202002_NLP_FIN.git

Cloning into '202002_NLP_FIN'...
remote: Enumerating objects: 46, done.[K
remote: Counting objects: 100% (46/46), done.[K
remote: Compressing objects: 100% (45/45), done.[K
remote: Total 46 (delta 22), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (46/46), done.


In [3]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import json

import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
from tqdm import tqdm_notebook

from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd

In [4]:
pretrained_weights = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
model = BertModel.from_pretrained(pretrained_weights)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




# **Emotion Recognition**

##### **Dataset**

In [5]:
data = {'train': {'dia_id': [], 'speaker': [], 'utterance': [], 'emotion': []},
        'dev': {'dia_id': [], 'speaker': [], 'utterance': [], 'emotion': []}}

for dtype in ['train', 'dev']:
  id = 0
  for dialog in json.loads(open('202002_NLP_FIN/friends_' + dtype + '.json').read()):
    for line in dialog:
      data['train']['dia_id'].append(id)
      data['train']['speaker'].append(line['speaker'])
      data['train']['utterance'].append(line['utterance'])
      data['train']['emotion'].append(line['emotion'])
    id = id + 1

for dtype in ['test']:
  id = 0
  for dialog in json.loads(open('202002_NLP_FIN/friends_' + dtype + '.json').read()):
    for line in dialog:
      data['dev']['dia_id'].append(id)
      data['dev']['speaker'].append(line['speaker'])
      data['dev']['utterance'].append(line['utterance'])
      data['dev']['emotion'].append(line['emotion'])
    id = id + 1

In [6]:
e2i_dict = dict((emo, i) for i, emo in enumerate(set(data['train']['emotion'])))
i2e_dict = {i: e for e, i in e2i_dict.items()}

In [7]:
weights = []

for v in i2e_dict.values():
  weights.append(data['train']['emotion'].count(v))

In [8]:
data['train'].items()



In [9]:
e2i_dict.items()

dict_items([('sadness', 0), ('disgust', 1), ('neutral', 2), ('surprise', 3), ('joy', 4), ('fear', 5), ('anger', 6), ('non-neutral', 7)])

In [10]:
i2e_dict.items()

dict_items([(0, 'sadness'), (1, 'disgust'), (2, 'neutral'), (3, 'surprise'), (4, 'joy'), (5, 'fear'), (6, 'anger'), (7, 'non-neutral')])

##### **Model**

In [11]:
class Model(nn.Module):
  def __init__(self):
    super().__init__()
    self.bert_tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
    self.bert_model = BertModel.from_pretrained(pretrained_weights)
    self.linear = torch.nn.Linear(768, len(e2i_dict))

  def forward(self, utterance_bef, utterance_now):
    tokens_tmp = self.bert_tokenizer.tokenize(utterance_bef)
    tokens = ['[CLS]'] + tokens_tmp + ['[SEP]'] # (len)
    tokens_tmp = self.bert_tokenizer.tokenize(utterance_now)
    tokens = tokens + tokens_tmp + ['[SEP]']
    ids = [tokenizer.convert_tokens_to_ids(tokens)] # (bat=1, len)
    input_tensor = torch.tensor(ids).cuda()

    hidden_tensor = self.bert_model(input_tensor)[0] # (bat, len, hid)
    hidden_tensor = hidden_tensor[:, 0, :] # (bat, hid)
    logit = self.linear(hidden_tensor)
    return logit

##### **Evaluation Metrics**

In [12]:
def evaluate(true_list, pred_list):
  precision = precision_score(true_list, pred_list, average=None)
  recall = recall_score(true_list, pred_list, average=None)
  micro_f1 = f1_score(true_list, pred_list, average='micro')
  print('precision:\t', ['%.4f' % v for v in precision])
  print('recall:\t\t', ['%.4f' % v for v in recall])
  print('micro_f1: %.6f' % micro_f1)

##### **Hyper-parameters**

In [13]:
pretrained_weights = 'bert-base-uncased'
learning_rate = 1e-5
n_epoch = 3

##### **Training**

In [None]:
model = Model()
model.cuda()

In [None]:
# model.load_state_dict(torch.load("bert_en_2.pt"))
# model.to(device)

In [15]:
class_weights = torch.tensor(weights, dtype=torch.float32)

class_weights = class_weights / class_weights.sum()
print(class_weights)

class_weights = 1.0 / class_weights
class_weights = class_weights / class_weights.sum()
print(class_weights)

class_weights = torch.FloatTensor(class_weights).cuda()

tensor([0.0352, 0.0224, 0.4466, 0.1168, 0.1198, 0.0182, 0.0509, 0.1901])
tensor([0.1653, 0.2596, 0.0130, 0.0498, 0.0486, 0.3190, 0.1142, 0.0306])


In [16]:
criterion = torch.nn.CrossEntropyLoss(weight=class_weights) # LogSoftmax & NLLLoss
optimizer = torch.optim.Adam(model.parameters(), learning_rate)

for i_epoch in range(n_epoch):
  print('i_epoch:', i_epoch)

  model.train()
  dia_id = -1
  utter_bef = "."

  for i_batch in tqdm_notebook(range(len(data['train']['utterance']))):
    if dia_id != data['train']['dia_id'][i_batch]:
      dia_id = data['train']['dia_id'][i_batch]
      utter_bef = "."
    else:
      utter_bef = data['train']['utterance'][i_batch - 1]
    
    logit = model(utter_bef, data['train']['utterance'][i_batch])

    target = torch.tensor([e2i_dict[data['train']['emotion'][i_batch]]]).cuda()
    loss = criterion(logit, target)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
  
  model.eval()
  dia_id = -1
  utter_bef = "."

  pred_list, true_list = [], []
  for i_batch in tqdm_notebook(range(len(data['dev']['utterance']))):
    if dia_id != data['dev']['dia_id'][i_batch]:
      dia_id = data['dev']['dia_id'][i_batch]
      utter_bef = "."
    else:
      utter_bef = data['dev']['utterance'][i_batch - 1]

    logit = model(utter_bef, data['dev']['utterance'][i_batch])

    _, max_idx = torch.max(logit, dim=-1)
    pred_list += max_idx.tolist()
    true_list += [e2i_dict[data['dev']['emotion'][i_batch]]]
  evaluate(pred_list, true_list) # print results

i_epoch: 0


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()


HBox(children=(FloatProgress(value=0.0, max=11739.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=2764.0), HTML(value='')))


precision:	 ['0.2588', '0.0000', '0.8819', '0.2098', '0.5855', '0.0000', '0.0994', '0.4787']
recall:		 ['0.6667', '0.0000', '0.7054', '0.7895', '0.6593', '0.0000', '0.8000', '0.3426']
micro_f1: 0.604197
i_epoch: 1


  _warn_prf(average, modifier, msg_start, len(result))


HBox(children=(FloatProgress(value=0.0, max=11739.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2764.0), HTML(value='')))


precision:	 ['0.3412', '0.1324', '0.8057', '0.3182', '0.6447', '0.0312', '0.1863', '0.4880']
recall:		 ['0.4394', '0.4500', '0.7418', '0.7398', '0.6323', '1.0000', '0.4762', '0.3372']
micro_f1: 0.599493
i_epoch: 2


HBox(children=(FloatProgress(value=0.0, max=11739.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2764.0), HTML(value='')))


precision:	 ['0.3529', '0.1912', '0.7304', '0.3497', '0.5724', '0.0312', '0.3540', '0.4972']
recall:		 ['0.4225', '0.4483', '0.7667', '0.7194', '0.6541', '0.1429', '0.3931', '0.3053']
micro_f1: 0.573082


In [17]:
# 모델 저장하기
torch.save(model.state_dict(), "bert_en_2.pt")

##### **Labeling**


In [18]:
model.eval()
dia_id = -1
utter_bef = "."

dialogs = pd.read_csv('202002_NLP_FIN/en_data.csv', sep=',')

f = open("en_pred_2.csv", "w")
f.write("Id" + ',' + "Predicted" + '\n')

with tqdm_notebook(total=dialogs.shape[0], desc='Labeling', unit='unit', ascii=True) as pbar:
  for id, i_dialog, i_utterance, utterance in zip(dialogs['id'], dialogs['i_dialog'], dialogs['i_utterance'], dialogs['utterance']):
    if dia_id != i_dialog:
      dia_id = i_dialog
      utter_bef = "."
    else:
      utter_bef = dialogs['utterance'][id - 1]
    
    logit = model(utter_bef, utterance)

    _, max_idx = torch.max(logit, dim=-1)
    pred_emotion = max_idx.tolist()[0]

    f.write(str(id) + ',' + str(i2e_dict[pred_emotion]) + '\n')
    pbar.update(1)

f.close()


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, description='Labeling', max=1623.0, style=ProgressStyle(description_wi…


