In [1]:
!nvidia-smi

Mon Jun 21 21:20:35 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.39       Driver Version: 418.39       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  On   | 00000000:01:00.0 Off |                  N/A |
|  0%   51C    P8    28W / 260W |   2687MiB / 10986MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [2]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm_
from transformers import TrainingArguments, Trainer
from transformers import BertJapaneseTokenizer, AutoTokenizer, AutoModelForSequenceClassification 
from transformers import EarlyStoppingCallback
from transformers import pipeline
torch.cuda.empty_cache()

class Config():
    def __init__(self):
        self.dropout = 0.5
        self.weight_decay=1e-4
        self.lr=1e-5
        self.epoches = 500
        self.grad_clip = 10
        self.batch_size = 8
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [3]:
def negsam_shuffle(encoded,lbl):
    ids, att, lbl = np.array(encoded['input_ids']), np.array(encoded['attention_mask']), np.array(lbl)

    pos_ids, pos_att, pos_lbl = ids[np.where(lbl==1)], att[np.where(lbl==1)], lbl[np.where(lbl==1)]
    neg_ids, neg_att, neg_lbl = ids[np.where(lbl!=1)], att[np.where(lbl!=1)], lbl[np.where(lbl!=1)]

    n_pos = pos_ids.shape[0]
    n_neg = neg_ids.shape[0]

    pos_pureidx = np.arange(n_pos)
    random.shuffle(pos_pureidx)
    pos_ids, pos_att, pos_lbl = pos_ids[pos_pureidx], pos_att[pos_pureidx], pos_lbl[pos_pureidx]

    neg_pureidx = np.arange(n_neg)
    neg_pureidx = random.sample(list(neg_pureidx), n_pos)
    neg_ids, neg_att, neg_lbl = neg_ids[neg_pureidx], neg_att[neg_pureidx], neg_lbl[neg_pureidx]

    ids = np.concatenate([pos_ids, neg_ids]).tolist()
    att = np.concatenate([pos_att, neg_att]).tolist()
    lbl = np.concatenate([pos_lbl, neg_lbl]).tolist()
#     print(ids)
#     print(att)
#     print(lbl)
    negsam_shuffled = {'input_ids': ids,'attention_mask': att}
    return negsam_shuffled, lbl

def train(conf, model, encoded, labels):
    model.train()
    inputs_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']
    inputs_ids = torch.LongTensor(inputs_ids)
    attention_masks = torch.LongTensor(attention_masks)
    labels = torch.LongTensor(labels)
    loss_ = 0
    idx = 0
    while True:
        if idx+conf.batch_size >= inputs_ids.shape[0]:
            b_inputs_ids = inputs_ids[idx:]
            b_attention_masks = attention_masks[idx:]
            b_labels = labels[idx:]
        else:
            b_inputs_ids = inputs_ids[idx:idx+conf.batch_size]
            b_attention_masks = attention_masks[idx:idx+conf.batch_size]
            b_labels = labels[idx:idx+conf.batch_size]
            
        if torch.cuda.is_available():
#             print(b_inputs_ids.shape)
            b_inputs_ids = b_inputs_ids.to(conf.device)
            b_attention_masks = b_attention_masks.to(conf.device)
            b_labels = b_labels.to(conf.device)
            model.cuda()
        else:
            print('CUDA IS NOT AVALABLE')
        optimizer.zero_grad()  # 一度計算された勾配結果を0にリセット
        output = model(input_ids=b_inputs_ids, attention_mask=b_attention_masks, labels=b_labels)
        output.loss.backward() 
        clip_grad_norm_(model.parameters(), conf.grad_clip)
        loss_ += output.loss.detach()
        torch.cuda.empty_cache()
        idx += conf.batch_size
        if idx >= inputs_ids.shape[0]:
            break
    return loss_

def val(conf, model, encoded, labels):
    model.eval()
    inputs_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']
    inputs_ids = torch.LongTensor(inputs_ids)
    attention_masks = torch.LongTensor(attention_masks)
    labels = torch.LongTensor(labels)
    logit = torch.FloatTensor()
    loss_ = 0
    idx = 0
    while True:
        if idx+conf.batch_size >= inputs_ids.shape[0]:
            b_inputs_ids = inputs_ids[idx:]
            b_attention_masks = attention_masks[idx:]
            b_labels = labels[idx:]
        else:
            b_inputs_ids = inputs_ids[idx:idx+conf.batch_size]
            b_attention_masks = attention_masks[idx:idx+conf.batch_size]
            b_labels = labels[idx:idx+conf.batch_size]
            
        if torch.cuda.is_available():
#             print(b_inputs_ids.shape)
            b_inputs_ids = b_inputs_ids.to(conf.device)
            b_attention_masks = b_attention_masks.to(conf.device)
            b_labels = b_labels.to(conf.device)
            model.cuda()
        else:
            print('CUDA IS NOT AVALABLE')
        with torch.no_grad(): ###
            optimizer.zero_grad()  # 一度計算された勾配結果を0にリセット
            output = model(input_ids=b_inputs_ids, attention_mask=b_attention_masks, labels=b_labels)
#         output.loss.backward() 
#         clip_grad_norm_(model.parameters(), conf.grad_clip)
#         optimizer.step()
        
        loss_ += output.loss.detach()
        logit = torch.cat((logit, output.logits.detach().cpu()), 0)
        torch.cuda.empty_cache()
        idx += conf.batch_size
        if idx >= inputs_ids.shape[0]:
            break
    
    
    
#     inputs_ids = encoded['input_ids']
#     attention_masks = encoded['attention_mask']
#     model.eval()
#     with torch.no_grad():  
#         if torch.cuda.is_available():
#             inputs_ids = torch.LongTensor(inputs_ids).to(conf.device)
#             attention_masks = torch.LongTensor(attention_masks).to(conf.device)
#             labels = torch.LongTensor(labels).to(conf.device)
#             model.cuda()
#             loss, logit = model(input_ids=inputs_ids, attention_mask=attention_masks, labels=labels)
#         else:
#             print('CUDA IS NOT AVALABLE')
    return loss_, logit

def compute_metrics(pred, labels):
    pred = np.argmax(pred, axis=1)
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}


In [4]:
# tgt_list = ['ja','zh']
# seg_list = ['train','dev','test']
conf=Config()
tgt = 'ja'
corpus = 'Friends'
label_orientation = "posinega"

if tgt == 'ja':
#     pretrained_model_name = 'daigo/bert-base-japanese-sentiment'
    pretrained_model_name = 'cl-tohoku/bert-base-japanese-whole-word-masking'
    model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name,num_labels=2) 
    tokenizer = BertJapaneseTokenizer.from_pretrained(pretrained_model_name)
elif tgt == 'zh':
    pretrained_model_name = 'nghuyong/ernie-1.0'
#     ch_pretrained_model_name = 'bert-base-chinese'
    # ch_pretrained_model_name = 'techthiyanes/chinese_sentiment' # _ を処理する必要がある
    # ch_pretrained_model_name = 'hfl/chinese-bert-wwm-ext'
    model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name,num_labels=2)
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name) 
    
if corpus == 'Friends':
    if label_orientation == "posinega":
        negative_labels = ['surprise','fear','anger','disgust','sadness']
        positive_labels = 'joy'
    else:
        pass
    
optimizer = optim.AdamW(model.parameters(), lr=conf.lr
                                            # weight_decay=conf.weight_decay,
                                            )
train_path = f'/nfs/nas-7.1/yamashita/LAB/giza-pp/sentiment_analysis/EmotionLines/{corpus}_{tgt}/friends_train'
val_path = f'/nfs/nas-7.1/yamashita/LAB/giza-pp/sentiment_analysis/EmotionLines/{corpus}_{tgt}/friends_dev'

columns_name = ['dialogue id','utterance id','speaker','text','emotion','annotation']
train_data = pd.read_csv(train_path, names=columns_name)
val_data = pd.read_csv(val_path, names=columns_name)

train_data['emotion'] = train_data['emotion'].replace(negative_labels,0) # negative
train_data['emotion'] = train_data['emotion'].replace(positive_labels,1) # positive
train_data = train_data[train_data['emotion'].isin([0,1])]
val_data['emotion'] = val_data['emotion'].replace(negative_labels,0) # negative
val_data['emotion'] = val_data['emotion'].replace(positive_labels,1) # positive
val_data = val_data[val_data['emotion'].isin([0,1])]

# ##### DROP NAN
# train_data.dropna(axis = 0, how ='any', inplace = True)
# val_data.dropna(axis = 0, how ='any', inplace = True)

X_train = list(train_data['text'])
Y_train = list(train_data['emotion'])
X_val = list(val_data['text'])
Y_val = list(val_data['emotion'])

X_train = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val = tokenizer(X_val, padding=True, truncation=True, max_length=512)
      
os.makedirs(f'ckpt/{corpus}/{tgt}/{label_orientation}/', exist_ok=True)
os.makedirs(f'log/{corpus}/{tgt}/{label_orientation}/', exist_ok=True)

train_losses, val_losses, log = [],[],[]
best_metric = -1
min_loss = 999999
for epoch in range(conf.epoches+1):
# Negative Sampling and Shuffling
    X_train_, Y_train_ = negsam_shuffle(X_train, Y_train)
# Train
    train_loss = train(conf, model, X_train_, Y_train_)
# Val
    val_loss, preds = val(conf, model, X_val, Y_val)
    preds = preds.detach().cpu().numpy()
#     Y_val = Y_val.numpy()
    metric_dict = compute_metrics(preds, Y_val)
    
#     if (best_metric < metric_dict['f1']) or (min_loss > val_loss):
    if (best_metric < metric_dict['f1']):
        patience = 0
        best_metric = metric_dict['f1']
        min_loss = val_loss
    else:
        patience += 1
    
    state = {'model': model.state_dict(), 
            'optimizer': optimizer.state_dict(), 
            'epoch': epoch}
    if epoch > 3:
        torch.save(state,f'ckpt/{corpus}/{tgt}/{label_orientation}/acc_{metric_dict["accuracy"]}_f1_{metric_dict["f1"]}_vloss_{val_loss}_epoch_{epoch}_lr_{conf.lr}_{pretrained_model_name.replace("/",")(")}.pt')
    if patience > 3:
        print(f'[!] early stop')
        break
        
    print('train_loss: {0:<20}, val_loss: {1:<20}, acc: {2:<20}, precision: {3:<20}, recall: {4:<20}, f1: {5:<20}, patience: {6}'.format(train_loss,val_loss,metric_dict['accuracy'],metric_dict['precision'],metric_dict['recall'],metric_dict['f1'],patience))
    log.append([train_loss,val_loss,metric_dict['accuracy'],metric_dict['precision'],metric_dict['recall'],metric_dict['f1'],patience])

log = pd.DataFrame(log, columns=['train_loss','val_loss','accuracy','precision','recall','f1','patience'])
log.to_csv(f'log/{corpus}/{tgt}/{label_orientation}/{pretrained_model_name.replace("/",")(")}_{epoch}_{conf.lr}_{conf.batch_size}')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=479.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=445021143.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=257706.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=110.0, style=ProgressStyle(description_…


train_loss: 221.32713317871094  , val_loss: 32.43389892578125   , acc: 0.7293868921775899  , precision: 0.391304347826087   , recall: 0.07317073170731707 , f1: 0.1232876712328767  , patience: 0
train_loss: 203.6045684814453   , val_loss: 20.645946502685547  , acc: 0.864693446088795   , precision: 0.9154929577464789  , recall: 0.5284552845528455  , f1: 0.6701030927835052  , patience: 0
train_loss: 204.27496337890625  , val_loss: 8.362760543823242   , acc: 0.9682875264270613  , precision: 0.9736842105263158  , recall: 0.9024390243902439  , f1: 0.9367088607594938  , patience: 0
train_loss: 225.0279541015625   , val_loss: 3.3167600631713867  , acc: 0.9830866807610994  , precision: 0.967479674796748   , recall: 0.967479674796748   , f1: 0.967479674796748   , patience: 0
train_loss: 276.6372985839844   , val_loss: 2.094297409057617   , acc: 0.9894291754756871  , precision: 0.9758064516129032  , recall: 0.983739837398374   , f1: 0.979757085020243   , patience: 0
train_loss: 354.3435974121094