In [1]:
!nvidia-smi

Fri Jun 25 18:08:20 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.39       Driver Version: 418.39       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  On   | 00000000:01:00.0 Off |                  N/A |
|  0%   51C    P2    65W / 260W |    894MiB / 10986MiB |     12%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [2]:
import os
import csv
import random
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix
import torch
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm_
from transformers import TrainingArguments, Trainer
from transformers import BertJapaneseTokenizer, AutoTokenizer, AutoModelForSequenceClassification 
from transformers import EarlyStoppingCallback
from transformers import pipeline
torch.cuda.empty_cache()

class Config():
    def __init__(self):
#         self.dropout = 0.1
#         self.weight_decay=1e-4
        self.lr=1e-5
        self.epoches = 500
        self.grad_clip = 10
        self.batch_size = 8
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

ver_dir = 'culdiff_save/001_add_token_type_id/'
os.makedirs(ver_dir, exist_ok=True)

conf=Config()
# tgt_list = ['ja','zh']
tgt = 'ja'
# data_diff_type_list = ['del','add','all']
data_diff_type = 'all'
# label_orientations = ["direct","intense","perspective"]
label_orientations = ["direct"]
# intense_orientations =['downgrader','upgrader','specific','respectful','humble','expect_sth_in_return','irony'] "all"
intense_orientation = "all"

In [3]:
def get_data_as_list(path):
    data = []
    with open(path, 'r', encoding='utf-8-sig')as f:
        reader = csv.reader(f)
        for row in reader:
            data.append(row)
    return data

def cutout_test_set(encoded,lbl):
    ids, att, typ, lbl = np.array(encoded['input_ids']), np.array(encoded['attention_mask']), np.array(encoded['token_type_ids']), np.array(lbl)
    pos_ids, pos_att, pos_typ, pos_lbl = ids[np.where(lbl>=1)], att[np.where(lbl>=1)], typ[np.where(lbl>=1)], lbl[np.where(lbl>=1)]
    neg_ids, neg_att, neg_typ, neg_lbl = ids[np.where(lbl==0)], att[np.where(lbl==0)], typ[np.where(lbl==0)], lbl[np.where(lbl==0)]

    n_pos = pos_ids.shape[0]
    n_neg = neg_ids.shape[0]
    
    # SHUFFLE 
    pos_pureidx = np.arange(n_pos)
    random.shuffle(pos_pureidx)
    pos_ids, pos_att, pos_typ, pos_lbl = pos_ids[pos_pureidx], pos_att[pos_pureidx], pos_typ[pos_pureidx], pos_lbl[pos_pureidx]

    neg_pureidx = np.arange(n_neg)
    random.shuffle(neg_pureidx)
#     neg_pureidx = random.sample(list(neg_pureidx), n_pos)
    neg_ids, neg_att, neg_typ, neg_lbl = neg_ids[neg_pureidx], neg_att[neg_pureidx], neg_typ[neg_pureidx], neg_lbl[neg_pureidx]
    
    # CUTOUT TEST
    n_test = int(pos_ids.shape[0]/5)
    test_pos_ids, test_pos_att, test_pos_typ, test_pos_lbl = pos_ids[:n_test], pos_att[:n_test], pos_typ[:n_test], pos_lbl[:n_test]
    test_neg_ids, test_neg_att, test_neg_typ, test_neg_lbl = neg_ids[:n_test], neg_att[:n_test], neg_typ[:n_test], neg_lbl[:n_test]
    trainval_pos_ids, trainval_pos_att, trainval_pos_typ, trainval_pos_lbl = pos_ids[n_test:], pos_att[n_test:], pos_typ[n_test:], pos_lbl[n_test:]
    trainval_neg_ids, trainval_neg_att, trainval_neg_typ, trainval_neg_lbl = neg_ids[n_test:], neg_att[n_test:], neg_typ[n_test:], neg_lbl[n_test:]
    
    trainval_ids = np.concatenate([trainval_pos_ids, trainval_neg_ids]).tolist()
    trainval_att = np.concatenate([trainval_pos_att, trainval_neg_att]).tolist()
    trainval_typ = np.concatenate([trainval_pos_typ, trainval_neg_typ]).tolist()
    trainval_lbl = np.concatenate([trainval_pos_lbl, trainval_neg_lbl]).tolist()
    test_ids = np.concatenate([test_pos_ids, test_neg_ids]).tolist()
    test_att = np.concatenate([test_pos_att, test_neg_att]).tolist()
    test_typ = np.concatenate([test_pos_typ, test_neg_typ]).tolist()
    test_lbl = np.concatenate([test_pos_lbl, test_neg_lbl]).tolist()
    
    trainval_encoded = {'input_ids': trainval_ids,'attention_mask': trainval_att,'token_type_ids': trainval_typ}
    test_encoded = {'input_ids': test_ids,'attention_mask': test_att,'token_type_ids': test_typ}
    
    return trainval_encoded, trainval_lbl, test_encoded, test_lbl

def make_closs_validation_set(pos_ids, pos_att, pos_typ, pos_lbl, neg_ids, neg_att, neg_typ, neg_lbl):
    train_idx = int(pos_ids.shape[0]/5*4)

    train_pos_ids, train_pos_att, train_pos_typ, train_pos_lbl, train_neg_ids, train_neg_att, train_neg_typ, train_neg_lbl = pos_ids[:train_idx], pos_att[:train_idx], pos_typ[:train_idx], pos_lbl[:train_idx], neg_ids[:train_idx], neg_att[:train_idx], neg_typ[:train_idx], neg_lbl[:train_idx]
    val_pos_ids, val_pos_att, val_pos_typ, val_pos_lbl, val_neg_ids, val_neg_att, val_neg_typ, val_neg_lbl = pos_ids[train_idx:], pos_att[train_idx:], pos_typ[train_idx:], pos_lbl[train_idx:], neg_ids[train_idx:], neg_att[train_idx:], neg_typ[train_idx:], neg_lbl[train_idx:]

    train_ids = np.concatenate([train_pos_ids, train_neg_ids]).tolist()
    train_att = np.concatenate([train_pos_att, train_neg_att]).tolist()
    train_typ = np.concatenate([train_pos_typ, train_neg_typ]).tolist()
    train_lbl = np.concatenate([train_pos_lbl, train_neg_lbl]).tolist()
    val_ids = np.concatenate([val_pos_ids, val_neg_ids]).tolist()
    val_att = np.concatenate([val_pos_att, val_neg_att]).tolist()
    val_typ = np.concatenate([val_pos_typ, val_neg_typ]).tolist()
    val_lbl = np.concatenate([val_pos_lbl, val_neg_lbl]).tolist()
    
    return train_ids,train_att,train_typ,train_lbl,val_ids,val_att,val_typ,val_lbl

def negsam_shuffle_clossval(encoded,lbl):
    ids, att, typ, lbl = np.array(encoded['input_ids']), np.array(encoded['attention_mask']), np.array(encoded['token_type_ids']), np.array(lbl)

    pos_ids, pos_att, pos_typ, pos_lbl = ids[np.where(lbl>=1)], att[np.where(lbl>=1)], typ[np.where(lbl>=1)], lbl[np.where(lbl>=1)]
    neg_ids, neg_att, neg_typ, neg_lbl = ids[np.where(lbl==0)], att[np.where(lbl==0)], typ[np.where(lbl==0)], lbl[np.where(lbl==0)]

    n_pos = pos_ids.shape[0]
    n_neg = neg_ids.shape[0]
    # SHUFFLE POSI
    pos_pureidx = np.arange(n_pos)
    random.shuffle(pos_pureidx)
    pos_ids, pos_att, pos_typ, pos_lbl = pos_ids[pos_pureidx], pos_att[pos_pureidx], pos_typ[pos_pureidx], pos_lbl[pos_pureidx]
    # SHUFFLE & NEGATIVE SAMPLE
    neg_pureidx = np.arange(n_neg)
    neg_pureidx = random.sample(list(neg_pureidx), n_pos)
    neg_ids, neg_att, neg_typ, neg_lbl = neg_ids[neg_pureidx], neg_att[neg_pureidx], neg_typ[neg_pureidx], neg_lbl[neg_pureidx]

    # CLOSS VALIDATION      
    train_ids,train_att,train_typ,train_lbl,val_ids,val_att,val_typ,val_lbl =  make_closs_validation_set(pos_ids, pos_att, pos_typ, pos_lbl, neg_ids, neg_att, neg_typ, neg_lbl)

    train_encoded = {'input_ids': train_ids,'attention_mask': train_att,'token_type_ids': train_typ}
    val_encoded = {'input_ids': val_ids,'attention_mask': val_att,'token_type_ids': val_typ}
    return train_encoded, train_lbl, val_encoded, val_lbl

def train(conf, model, encoded, labels):
    model.train()
    inputs_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']
    token_type_ids = encoded['token_type_ids']
    inputs_ids = torch.LongTensor(inputs_ids)
    attention_masks = torch.LongTensor(attention_masks)
    token_type_ids = torch.LongTensor(token_type_ids)
    labels = torch.LongTensor(labels)
    loss_ = 0
    idx = 0
    while True:
        if idx+conf.batch_size >= inputs_ids.shape[0]:
            b_inputs_ids = inputs_ids[idx:]
            b_attention_masks = attention_masks[idx:]
            b_token_type_ids = token_type_ids[idx:]
            b_labels = labels[idx:]
        else:
            b_inputs_ids = inputs_ids[idx:idx+conf.batch_size]
            b_attention_masks = attention_masks[idx:idx+conf.batch_size]
            b_token_type_ids = token_type_ids[idx:idx+conf.batch_size]
            b_labels = labels[idx:idx+conf.batch_size]
            
        if torch.cuda.is_available():
#             print(b_inputs_ids.shape)
            b_inputs_ids = b_inputs_ids.to(conf.device)
            b_attention_masks = b_attention_masks.to(conf.device)
            b_token_type_ids = b_token_type_ids.to(conf.device)
            b_labels = b_labels.to(conf.device)
            model.cuda()
        else:
            print('CUDA IS NOT AVALABLE')
        optimizer.zero_grad()  # 一度計算された勾配結果を0にリセット
        output = model(input_ids=b_inputs_ids, attention_mask=b_attention_masks, token_type_ids=b_token_type_ids, labels=b_labels)
        output.loss.backward() 
        clip_grad_norm_(model.parameters(), conf.grad_clip)
        loss_ += output.loss.detach()
        torch.cuda.empty_cache()
        idx += conf.batch_size
        if idx >= inputs_ids.shape[0]:
            break
    return loss_

def val(conf, model, encoded, labels):
    model.eval()
    inputs_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']
    token_type_ids = encoded['token_type_ids']
    inputs_ids = torch.LongTensor(inputs_ids)
    attention_masks = torch.LongTensor(attention_masks)
    token_type_ids = torch.LongTensor(token_type_ids)
    labels = torch.LongTensor(labels)
    logit = torch.FloatTensor()
    loss_ = 0
    idx = 0
    while True:
        if idx+conf.batch_size >= inputs_ids.shape[0]:
            b_inputs_ids = inputs_ids[idx:]
            b_attention_masks = attention_masks[idx:]
            b_token_type_ids = token_type_ids[idx:]
            b_labels = labels[idx:]
        else:
            b_inputs_ids = inputs_ids[idx:idx+conf.batch_size]
            b_attention_masks = attention_masks[idx:idx+conf.batch_size]
            b_token_type_ids = token_type_ids[idx:idx+conf.batch_size]
            b_labels = labels[idx:idx+conf.batch_size]
        if torch.cuda.is_available():
#             print(b_inputs_ids.shape)
            b_inputs_ids = b_inputs_ids.to(conf.device)
            b_attention_masks = b_attention_masks.to(conf.device)
            b_token_type_ids = b_token_type_ids.to(conf.device)
            b_labels = b_labels.to(conf.device)
            model.cuda()
        else:
            print('CUDA IS NOT AVALABLE')
        with torch.no_grad(): ###
            optimizer.zero_grad()  # 一度計算された勾配結果を0にリセット
            output = model(input_ids=b_inputs_ids, attention_mask=b_attention_masks, token_type_ids=b_token_type_ids, labels=b_labels)
#         output.loss.backward() 
#         clip_grad_norm_(model.parameters(), conf.grad_clip)
#         optimizer.step()
        
        loss_ += output.loss.detach()
        logit = torch.cat((logit, output.logits.detach().cpu()), 0)
        torch.cuda.empty_cache()
        idx += conf.batch_size
        if idx >= inputs_ids.shape[0]:
            break

    return loss_, logit

def compute_metrics(pred, labels):
    pred = np.argmax(pred, axis=1)
    try: # Multi Label
        accuracy = accuracy_score(y_true=labels, y_pred=pred, average='weighted')
    except: # Bin Label
        accuracy = accuracy_score(y_true=labels, y_pred=pred)
    try: # Multi Label
        recall = recall_score(y_true=labels, y_pred=pred, average='weighted')
    except: # Bin Label
        recall = recall_score(y_true=labels, y_pred=pred)
    try: # Multi Label
        precision = precision_score(y_true=labels, y_pred=pred, average='weighted')
    except: # Bin Label
        precision = precision_score(y_true=labels, y_pred=pred)
    try: # Multi Label
        f1 = f1_score(y_true=labels, y_pred=pred, average='weighted')
    except: # Bin Label
        f1 = f1_score(y_true=labels, y_pred=pred)
#     except: # Bin Label
#         accuracy = accuracy_score(y_true=labels, y_pred=pred)
#         recall = recall_score(y_true=labels, y_pred=pred)
#         precision = precision_score(y_true=labels, y_pred=pred)
#         f1 = f1_score(y_true=labels, y_pred=pred)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}


In [4]:


for label_orientation in label_orientations:
    pass

if label_orientation == "direct":
    num_labels = 3
elif label_orientation == "intense":
    num_labels = 3
elif label_orientation == "perspective":
    num_labels = 5

if tgt == 'ja':
#     pretrained_model_name = 'daigo/bert-base-japanese-sentiment'
    pretrained_model_name = 'cl-tohoku/bert-base-japanese-whole-word-masking'
    model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name,num_labels=num_labels) 
    tokenizer = BertJapaneseTokenizer.from_pretrained(pretrained_model_name)
elif tgt == 'zh':
    pretrained_model_name = 'nghuyong/ernie-1.0'
#     ch_pretrained_model_name = 'bert-base-chinese'
    # ch_pretrained_model_name = 'techthiyanes/chinese_sentiment' # _ を処理する必要がある
    # ch_pretrained_model_name = 'hfl/chinese-bert-wwm-ext'
    model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name,num_labels=num_labels)
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name) 
    
optimizer = optim.AdamW(model.parameters(), lr=conf.lr
#                                             weight_decay=conf.weight_decay,
                                            )


Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

# GET DATA & LABELS

In [5]:
ja_sig_list=[   ['del','cejc','query','request','Trust'],
                ['del','cejc','query','thanksgiving','Trust'],
                ['del','cejc','res','request','Trust'],
                ['add','mpdd','query','apology','Disgust'],
                ['add','mpdd','query','request','Sadness'], 
                ['add','mpdd','query','request','Disgust'],
                ['add','mpdd','query','request','Joy'],
                ['add','mpdd','query','thanksgiving','Sadness'],
                ['add','mpdd','query','thanksgiving','Disgust'],
                ['add','mpdd','query','thanksgiving','Trust'], 
                ['add','mpdd','query','thanksgiving','Joy'],
                ['add','mpdd','res','request','Sadness'],
                ['add','mpdd','res','request','Disgust'],
                ['add','mpdd','res','request','Trust'],
                ['add','mpdd','res','request','Joy'],
                ['add','mpdd','res','thanksgiving','Sadness']]
zh_sig_list=[   ['del',	'mpdd',	'query',	'request',		'affect'],
                ['del',	'mpdd',	'query',	'request',		'negemo'],
                ['del',	'mpdd',	'query',	'request',		'anger'],
                ['del',	'mpdd',	'res',	'thanksgiving',	'affect'],
                ['add',	'cejc',	'query',	'apology',	    'affect'],
                ['add',	'cejc',	'query',	'apology',	    'posemo'],
                ['add',	'cejc',	'query',	'apology',	    'negemo'],
                ['add',	'cejc',	'query',	'apology',	    'anger'],
                ['add',	'cejc',	'query',	'request',	    'negemo'],
                ['add',	'cejc',	'res',	'request',	    'affect'],
                ['add',	'cejc',	'res',	'request',	    'posemo']]
sig_list = []

if (tgt=='ja') and (data_diff_type == "add"):
    for sig in ja_sig_list:
        if sig[0] == "add":
            sig_list.append(sig)
    labeled_table_paths = ['JIWC_diff_reason_table.csv']
elif (tgt=='ja') and (data_diff_type == "del"):
    for sig in zh_sig_list:
        if sig[0] == "del":
            sig_list.append(sig)
    labeled_table_paths = ['CLIWC_diff_reason_table.csv']
elif (tgt=='zh') and (data_diff_type == "add"):
    for sig in zh_sig_list:
        if sig[0] == "add":
            sig_list.append(sig)
    labeled_table_paths = ['CLIWC_diff_reason_table.csv']
elif (tgt=='zh') and (data_diff_type == "del"):
    for sig in ja_sig_list:
        if sig[0] == "del":
            sig_list.append(sig)
    labeled_table_paths = ['JIWC_diff_reason_table.csv']
elif (tgt=='ja') and (data_diff_type == "all"):
    for sig in ja_sig_list:
        if sig[0] == "add":
            sig_list.append(sig)
    for sig in zh_sig_list:
        if sig[0] == "del":
            sig_list.append(sig)
    labeled_table_paths = ['JIWC_diff_reason_table.csv','CLIWC_diff_reason_table.csv']
elif (tgt=='zh') and (data_diff_type == "all"):
    for sig in zh_sig_list:
        if sig[0] == "add":
            sig_list.append(sig)
    for sig in ja_sig_list:
        if sig[0] == "del":
            sig_list.append(sig)
    labeled_table_paths = ['JIWC_diff_reason_table.csv','CLIWC_diff_reason_table.csv']

data_pair,labels=[],[]
for labeled_table_path in labeled_table_paths:
    columns_name=['diff_type','corpus','situation','sen_type','emotion','word','htmt','line','part','effect','direct','intense','perspective']
    df = pd.read_csv(labeled_table_path, names=columns_name)

    # REPLACE subordinate concept to MOREorLESS intencity
    if label_orientation != "intense":
        more =   ['lessdowngrader','moreupgrader','morespecific','lessrespectful','lesshumble','add_expect_sth_in_return','add_irony']
        less = ['moredowngrader','lessspecific','lessupgrader','morerespectful','morehumble','rmv_expect_sth_in_return','rmv_irony']
        for m, l in zip(more, less):
            df=df.replace(m,'moreintense')
            df=df.replace(l,'lessintense')
    elif label_orientation == "intense" and intense_orientation == "all":
        more =   ['lessdowngrader','moreupgrader','morespecific','lessrespectful','lesshumble','add_expect_sth_in_return','add_irony']
        less = ['moredowngrader','lessspecific','lessupgrader','morerespectful','morehumble','rmv_expect_sth_in_return','rmv_irony']
        for m, l in zip(more, less):
            df=df.replace(m,'moreintense')
            df=df.replace(l,'lessintense')
    elif label_orientation == "intense":
        pass

    # REPLACE labels to ids
    if label_orientation == "direct":
        df = df.replace(f"more{label_orientation}",2) # more: 2
        df = df.replace(f"less{label_orientation}",1) # less: 1 
        df.loc[~(df[label_orientation].isin([1,2])), label_orientation]= 0 # other: 0
    elif label_orientation == "intense" and intense_orientation == "all":
        df = df.replace(f"more{label_orientation}",2) # more: 2
        df = df.replace(f"less{label_orientation}",1) # less: 1 
        df.loc[~(df[label_orientation].isin([1,2])), label_orientation]= 0 # other: 0    
    elif label_orientation == "intense":
        if intense_orientation in ['upgrader','specific',]:
            df = df.replace(f"more{intense_orientation}",2) # more intense: 2
            df = df.replace(f"less{intense_orientation}",1) # less intense: 1 
        if intense_orientation in ['downgrader','respectful','humble']:
            df = df.replace(f"less{intense_orientation}",2) # more intense: 1
            df = df.replace(f"more{intense_orientation}",1) # less intense: 2 
        elif intense_orientation in ['expect_sth_in_return','irony']:
            df = df.replace(f"add_{intense_orientation}",2) # more intense: 2
            df = df.replace(f"rmv_{intense_orientation}",1) # less intense: 1 
        df.loc[~(df[label_orientation].isin([1,2])), label_orientation]= 0 # other: 0   
    elif label_orientation == "perspective":
        df = df.replace(f"speaker_oriented",int(4)) 
        df = df.replace(f"listener_oriented",int(3))
        df = df.replace(f"speaker_listener_oriented",int(2))
        df = df.replace(f"impersonal_oriented",int(1))
        df.loc[~df[label_orientation].isin([1,2,3,4]), label_orientation] = 0 # other: 0

    df[label_orientation] = df[label_orientation].astype('int8')
    index_names=[]
    tmp_data_pair = []
    tmp_labels = []
    for s in sig_list:
        diff_type=s[0]
        corpus=s[1]
        sen_type=s[2]
        situation=s[3]
        emotion=s[4]
        # FILTER TABLE
        emo_cond = df['diff_type'].isin([diff_type]) & df['corpus'].isin([corpus]) & df['sen_type'].isin([sen_type]) & df['situation'].isin([situation]) & df['emotion'].isin([emotion])
        gizamiss_cond = df['part'].isin(['gizamiss','labelmiss'])
    #     line = df[emo_cond&~gizamiss_cond]['line'].to_list()
    #     labels = df[emo_cond&~gizamiss_cond]['line'].to_list()
        line_list = df[emo_cond]['line'].to_list()
        label_list = df[emo_cond][label_orientation].to_list()
    #     word = df[emo_cond&~gizamiss_cond]['word'].to_list()
        # GET DATA
        if tgt == 'ja':
            MT_path = f'../data/mpdd/{situation}/translated_{sen_type}.csv'
            HT_path = f'../data/mpdd/{situation}/rewrited_{sen_type}.csv'
        elif tgt == 'zh':
            MT_path = f'../data/cejc/{situation}/translated_{sen_type}.csv'
            HT_path = f'../data/cejc/{situation}/rewrited_{sen_type}.csv'

        MT_data = get_data_as_list(MT_path)
        HT_data = get_data_as_list(HT_path)
    #     MT_unaligned = get_mrphdata_as_list(MT_unaligned_path)
    #     HT_unaligned = get_mrphdata_as_list(HT_unaligned_path)
    #     MT_mrph = get_mrphdata_as_list(MT_mrph_path)
    #     HT_mrph = get_mrphdata_as_list(HT_mrph_path)

        for line,label in zip(line_list,label_list):
#             if len(MT_data[line][0]) >= 509:
#                 MT_data[line][0]=MT_data[line][0][:509]
#             if len(HT_data[line][0]) >= 509:
#                 HT_data[line][0]=HT_data[line][0][:509]
            tmp_data_pair.append([MT_data[line][0],HT_data[line][0]])
            tmp_labels.append(label) 
    data_pair.extend(tmp_data_pair)
    labels.extend(tmp_labels)
# display(df[df[label_orientation].isin([1,2])])
print(len(data_pair))
print(len(labels))
print(data_pair[:3])
print(labels[:3])

1283
1283
[['ごめんね！ 私の口調は少し重いです。 王ルオディは単純な女の子なんだから、間に連れてくるのはやめた方がいいよ。 本当は、私に心を寄せるのではなく、良い結果を出すべきなのです。 私のために自分を隠して、自分を大切にしてくれる人との連絡を絶つ必要はないのは言うまでもありません。', '言い方がきつくてごめんね。ただ、王若蝶は単純な子だから、あんまり関わらない方がいいと思う。大事なのは結果を出すことだから、私のことを思ってくれなくてもいいし、私のために他の人とのつながりを断とうとまでは思わないで。'], ['ごめんね！ 誰かにスパイして欲しいと頼みました でも、ホテルで働いてくれって言いに来たわけじゃないんですよ! 会いに行きたかっただけなんですよ！（笑）。 前回出てから思ったんだけど、青城の若い有名ホストや起業家はみんな知ってるよ。 でも、私の記憶の中にあなたのような人はいないわ！ ようやく頭が痛くなってきた！という考えがまだ頭に浮かびませんでした。', 'ごめんなさい。確かに人にあなたを探させたけど、今日来たのはただ顔を見たかっただけなんです。バーで働かせようというのではありません。青城市の名司会者と企業家はみんな知っているけど、あなたみたいな人はいなくて、どうしても思いつかなかったもので……。'], ['すみません！ クオンさんは、あなたの請求書は彼が支払わなければならないと指示しました。 その上、すでに２００００人分の保証金を払っている。', 'すみません。鄺さんから、皆さんの分は絶対に払わせてもらうんだと言われているんです。もう2000元の補償金を出しているのですが……。']]
[0, 0, 0]


# MAKE DATASET

In [6]:
X_train = tokenizer(data_pair, padding=True, truncation=True, max_length=512)
Y_train = labels
######################################################
# CUTOUT TRAINVAL-SET/TEST-SET & SAVE THEM 
X_train, Y_train, test_encoded, test_lbl = cutout_test_set(X_train,Y_train)
print('len of test-set: ', len(test_lbl))

if label_orientation != 'intense':
    save_data_dir = f'{ver_dir}data/{tgt}/{data_diff_type}/{label_orientation}/'
    save_ckpt_dir = f'{ver_dir}ckpt/{tgt}/{data_diff_type}/{label_orientation}/'
    save_log_dir = f'{ver_dir}log/{tgt}/{data_diff_type}/{label_orientation}/'
elif label_orientation == 'intense':
    save_data_dir = f'{ver_dir}data/{tgt}/{data_diff_type}/{label_orientation}/{intense_orientation}/'
    save_ckpt_dir = f'{ver_dir}ckpt/{tgt}/{data_diff_type}/{label_orientation}/{intense_orientation}/'
    save_log_dir = f'{ver_dir}log/{tgt}/{data_diff_type}/{label_orientation}/{intense_orientation}/'
    
os.makedirs(save_data_dir, exist_ok=True)
os.makedirs(save_ckpt_dir, exist_ok=True)
os.makedirs(save_log_dir, exist_ok=True)
os.system(f'rm {save_data_dir}*')
print(f'[!] Clear the checkpoints under {save_data_dir}')
os.system(f'rm {save_ckpt_dir}*')
print(f'[!] Clear the checkpoints under {save_ckpt_dir}')
os.system(f'rm {save_log_dir}*')
print(f'[!] Clear the checkpoints under {save_log_dir}')

with open(f"{save_data_dir}trainval_data", mode="wb") as f:
    pickle.dump([X_train, Y_train], f)
with open(f"{save_data_dir}test_data", mode="wb") as f:
    pickle.dump([test_encoded, test_lbl], f)

test_data_string = []
for t , l in zip(test_encoded['input_ids'],test_lbl):
    t = tokenizer.batch_decode(t,skip_special_tokens=True,clean_up_tokenization_spaces=True)
    test_data_string.append([t,l])
with open(f"{save_data_dir}test_data.string", "w", encoding="utf_8_sig") as f:
    writer = csv.writer(f)
    writer.writerows(test_data_string)
######################################################
# # LOAD TRAINVAL-SET/TEST-SET 
# with open(f"culdiff_data/{tgt}/{label_orientation}/test_data", mode="rb") as f:
#     X_train, Y_train = pickle.load(f)
# with open(f"culdiff_data/{tgt}/{label_orientation}/test_data", mode="rb") as f:
#     test_encoded, test_lbl = pickle.load(f)
######################################################
# del test_encoded, test_lbl, test_data_string


len of test-set:  62
[!] Clear the checkpoints under culdiff_save/001_add_token_type_id/data/ja/all/direct/
[!] Clear the checkpoints under culdiff_save/001_add_token_type_id/ckpt/ja/all/direct/
[!] Clear the checkpoints under culdiff_save/001_add_token_type_id/log/ja/all/direct/


# Train & Validation

In [7]:
train_losses, val_losses, log = [],[],[]
best_metric = -1
min_loss = 999999
print()
for epoch in range(conf.epoches+1):
# Negative Sampling and Shuffling and Clossvalidaition
    X_train_, Y_train_, X_val, Y_val = negsam_shuffle_clossval(X_train, Y_train)
# Train
    train_loss = train(conf, model, X_train_, Y_train_)
# Val
    val_loss, preds = val(conf, model, X_val, Y_val)
    preds = preds.detach().cpu().numpy()
#     Y_val = Y_val.numpy()
    metric_dict = compute_metrics(preds, Y_val)
#     confusion_matrix(Y_val, preds, labels=[0,1,2])
    
#     if (best_metric < metric_dict['f1']) or (min_loss > val_loss):
    if (best_metric < metric_dict['f1']):
        patience = 0
        best_metric = metric_dict['f1']
        min_loss = val_loss
    else:
        patience += 1
    
    state = {'model': model.state_dict(), 
            'optimizer': optimizer.state_dict(), 
            'epoch': epoch}
#     if epoch > 1:
    torch.save(state,f'{save_ckpt_dir}acc_{metric_dict["accuracy"]}_f1_{metric_dict["f1"]}_vloss_{val_loss}_epoch_{epoch}_lr_{conf.lr}_{pretrained_model_name.replace("/",")(")}.pt')
    if patience > 3:
        print(f'[!] early stop')
        break
        
    print('train_loss: {0:<20}, val_loss: {1:<20}, acc: {2:<20}, precision: {3:<20}, recall: {4:<20}, f1: {5:<20}, patience: {6}'.format(train_loss,val_loss,metric_dict['accuracy'],metric_dict['precision'],metric_dict['recall'],metric_dict['f1'],patience))
    log.append([train_loss,val_loss,metric_dict['accuracy'],metric_dict['precision'],metric_dict['recall'],metric_dict['f1'],patience])

log = pd.DataFrame(log, columns=['train_loss','val_loss','accuracy','precision','recall','f1','patience'])
log.to_csv(f'{save_log_dir}{pretrained_model_name.replace("/",")(")}_{epoch}_{conf.lr}_{conf.batch_size}')




  _warn_prf(average, modifier, msg_start, len(result))


train_loss: 27.058481216430664  , val_loss: 7.543033123016357   , acc: 0.46153846153846156 , precision: 0.4341085271317829  , recall: 0.46153846153846156 , f1: 0.3678160919540229  , patience: 0
train_loss: 26.908424377441406  , val_loss: 7.50247859954834    , acc: 0.40384615384615385 , precision: 0.23863636363636365 , recall: 0.40384615384615385 , f1: 0.30000000000000004 , patience: 1
train_loss: 27.569162368774414  , val_loss: 7.464454650878906   , acc: 0.4807692307692308  , precision: 0.41557555919258043 , recall: 0.4807692307692308  , f1: 0.3631077826283305  , patience: 2


  _warn_prf(average, modifier, msg_start, len(result))


train_loss: 27.23859977722168   , val_loss: 7.457955837249756   , acc: 0.4807692307692308  , precision: 0.25510204081632654 , recall: 0.4807692307692308  , f1: 0.33333333333333337 , patience: 3
train_loss: 27.106962203979492  , val_loss: 7.465885639190674   , acc: 0.5                 , precision: 0.7411858974358976  , recall: 0.5                 , f1: 0.3748200863585479  , patience: 0
train_loss: 27.22520637512207   , val_loss: 7.435189247131348   , acc: 0.5                 , precision: 0.2708333333333333  , recall: 0.5                 , f1: 0.35135135135135137 , patience: 1
train_loss: 27.35380744934082   , val_loss: 7.500955104827881   , acc: 0.4807692307692308  , precision: 0.2604166666666667  , recall: 0.4807692307692308  , f1: 0.33783783783783783 , patience: 2
train_loss: 27.18480682373047   , val_loss: 7.538060665130615   , acc: 0.4230769230769231  , precision: 0.25                , recall: 0.4230769230769231  , f1: 0.3142857142857143  , patience: 3
[!] early stop


# Test

In [8]:
def load_best_model(tgt, path, num_labels):
    import torch
    best_acc, best_f1, best_epoch, best_file, best_pretrained_model_name = -1, -1, -1, None, None
    
    for file in os.listdir(path):
        try:
#             /acc_0.5454545454545454_f1_0.4761904761904762_vloss_2.8154916763305664_epoch_4_lr_1e-05_nghuyong)(ernie-1.0.pt
            _, acc, _, f1, _, loss, _, epoch, _, lr, pretrained_model_name = file.split("_")
            pretrained_model_name = pretrained_model_name[:-3]
            pretrained_model_name = pretrained_model_name.replace(")(","/")
        except:
            continue
        acc = float(acc)
        f1 = float(f1)
        epoch = int(epoch)
        if f1 > best_f1:
            best_file = file
            best_epoch = epoch
            best_acc = acc
            best_f1 = f1
            best_loss = loss
            best_pretrained_model_name = pretrained_model_name
        elif f1 == best_f1:
            if best_loss > loss:
                best_file = file
                best_epoch = epoch
                best_acc = acc
                best_f1 = f1
                best_loss = loss
                best_pretrained_model_name = pretrained_model_name               
    if best_file:
        file_path = path + best_file
        if tgt == 'ja':
            tokenizer = BertJapaneseTokenizer.from_pretrained(best_pretrained_model_name)
        else:
            tokenizer = AutoTokenizer.from_pretrained(best_pretrained_model_name)
        model = AutoModelForSequenceClassification.from_pretrained(best_pretrained_model_name,num_labels=num_labels)
        model.load_state_dict(torch.load(file_path)['model'])
        print(f'[!] Load the model from {file_path}')
    else:
        raise Exception(f"[!] No saved model")
    return model, tokenizer

In [9]:

if label_orientation != 'intense':
    test_data_dir = f'{ver_dir}data/{tgt}/{data_diff_type}/{label_orientation}/'
    ckpt_dir = f'{ver_dir}ckpt/{tgt}/{data_diff_type}/{label_orientation}/'
    test_result_dir = f'{ver_dir}test/{tgt}/{data_diff_type}/{label_orientation}/'
elif label_orientation == 'intense':
    test_data_dir = f'{ver_dir}data/{tgt}/{data_diff_type}/{label_orientation}/{intense_orientation}/'
    ckpt_dir = f'{ver_dir}ckpt/{tgt}/{data_diff_type}/{label_orientation}/{intense_orientation}/'
    test_result_dir = f'{ver_dir}test/{tgt}/{data_diff_type}/{label_orientation}/{intense_orientation}/'
    
model,tokenizer = load_best_model(tgt, ckpt_dir, num_labels)

val_loss, preds = val(conf, model, test_encoded, test_lbl)
preds = preds.detach().cpu().numpy()
metric_dict = compute_metrics(preds, test_lbl)

os.makedirs(f'{test_result_dir}', exist_ok=True)
os.system(f'rm {test_result_dir}*')
print(f'[!] Clear the checkpoints under {test_result_dir}')

confusion_matrix_ = confusion_matrix(test_lbl, np.argmax(preds,axis=1), labels=list(range(num_labels)))
confusion_matrix_ = pd.DataFrame(confusion_matrix_)
confusion_matrix_.to_csv(f'{test_result_dir}confusion_matrix.csv',header=True, index=True)
display(confusion_matrix_)

preds = pd.DataFrame(preds)
preds.to_csv(f'{test_result_dir}logits.csv',header=False, index=False)

log = pd.DataFrame([metric_dict['accuracy'],metric_dict['precision'],metric_dict['recall'],metric_dict['f1']], index=['accuracy','precision','recall','f1'])
log.to_csv(f'{test_result_dir}metric.csv',header=False, index=True)
print('acc: {0:<20}, precision: {1:<20}, recall: {2:<20}, f1: {3:<20}'.format(metric_dict['accuracy'],metric_dict['precision'],metric_dict['recall'],metric_dict['f1']))

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

[!] Load the model from culdiff_save/001_add_token_type_id/ckpt/ja/all/direct/acc_0.5_f1_0.3748200863585479_vloss_7.465885639190674_epoch_4_lr_1e-05_cl-tohoku)(bert-base-japanese-whole-word-masking.pt
[!] Clear the checkpoints under culdiff_save/001_add_token_type_id/test/ja/all/direct/


Unnamed: 0,0,1,2
0,28,2,1
1,27,0,4
2,0,0,0


acc: 0.45161290322580644 , precision: 0.2545454545454545  , recall: 0.45161290322580644 , f1: 0.32558139534883723 
