In [1]:
import os
import time
import csv
import math
import gc

import pandas as pd
from pprint import pprint
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_

from transformers import BertTokenizer, BertModel ,AdamW
from transformers.tokenization_bert_japanese import BertJapaneseTokenizer


# Functions for get SenVec of Original text

In [2]:
def get_data(f_path):
    text_data = []
    with open(f_path, 'r', encoding='utf_8_sig') as f:
        for line in f:
            text_data.append(line[:-1])
#             text_data.append(line[:-1].split(' '))
    return text_data

def encode(tokenizer, data):
    
    input_ids=[]
    attention_mask=[]
    token_type_ids=[]
    for d in data:
        encoded = tokenizer.encode_plus(d,
                                    max_length = 128,
                                    truncation = True,
                                    padding = 'max_length',
                                    add_special_tokens = True,
                                    pad_to_max_length=True,
                                    return_attention_mask=True
                                    )
        input_ids.append(encoded['input_ids'])
        attention_mask.append(encoded['attention_mask'])
        token_type_ids.append(encoded['token_type_ids'])
    input_ids= torch.LongTensor(input_ids)
    attention_mask= torch.LongTensor(attention_mask)
    token_type_ids= torch.LongTensor(token_type_ids)
    return input_ids, attention_mask, token_type_ids 

def on_gpu(input_ids, attention_mask, token_type_ids, device):
    if torch.cuda.is_available():
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
    return input_ids, attention_mask, token_type_ids

# Functions for get SenVec of Generated text

In [3]:
def get_data_as_list(fpath) -> list:
    data = []
    with open(fpath, "r", encoding="utf-8-sig") as f:
        for line in f:
            line = line.replace("\n","").split(" ")
            data.append(line)
    return data

def get_diff_sentences(df,corpus="mpdd", situation='apology', sen_type='query', diff_type='del', percentile=75, top=True, thirdcol_obj='primitive form'):
    def count(x):
        return x.count() 
    df = df[df['corpus'].isin([corpus])&df['situation'].isin([situation])&df['sentence type'].isin([sen_type])&df['difference type'].isin([diff_type])]
    unaligned_count = df.groupby(['corpus', 'situation', 'method', 'sentence type', 'difference type','line'])['index'].apply(count).reset_index()
    unaligned_count = unaligned_count.rename(columns={'index': 'count'})
    
#     unaligned_count['quantile'] = unaligned_count.groupby(['corpus', 'situation', 'method', 'sentence type', 'difference type',])['count'].transform(lambda group: np.quantile(group,quantile))
#     print(unaligned_count)
#     if top==True:
#         diff_df = unaligned_count.groupby(['corpus', 'situation', 'method', 'sentence type', 'difference type','line']).filter(lambda group: group['count']>=group['quantile']).reset_index()
#     else:
#         diff_df = unaligned_count.groupby(['corpus', 'situation', 'method', 'sentence type', 'difference type','line']).filter(lambda group: group['count']<=group['quantile']).reset_index()

    unaligned_count.set_index('line',inplace=True)
    translated_dict = unaligned_count[unaligned_count['method']=='translated']['count'].to_dict()
    rewrited_dict = unaligned_count[unaligned_count['method']=='rewrited']['count'].to_dict()
    calced_dict={}
    for i in range(max(max(translated_dict),max(rewrited_dict))+1):
        if (i in translated_dict) and (i in rewrited_dict):
            calced_dict[i] = rewrited_dict[i] - translated_dict[i]
        elif (i not in translated_dict) and (i in rewrited_dict):
            calced_dict[i] = rewrited_dict[i]
        elif (i  in translated_dict) and (i not in rewrited_dict):
            calced_dict[i] = - translated_dict[i]
        else:
            pass
    values = list(calced_dict.values())
    threshold = np.percentile(np.array(values),percentile)
    calced_dict = dict(sorted(calced_dict.items(), key=lambda i: i[1], reverse=True))
#     print(calced_dict)
#     method_type = ['original','translated','rewrited']
    method_type = ['translated','rewrited']
#     sentence_type = ['query', 'res']
    sentence_type = [sen_type]
#     dirs = ['../mrphdata/']
    dirs = ['/nfs/nas-7.1/yamashita/LAB/BertRuber/data']
    print(f'{corpus} {situation} {diff_type}')
    translated = []
    rewrited = []
    for line, val in calced_dict.items():
        if top==True and val < threshold:
            continue
        if top==False and val > threshold:
            continue
        for t in sentence_type:
            for m in method_type:
                for d in dirs:
                    path =f'{d}/{corpus}/{situation}/{m}_{t}.csv'
                    with open(path, 'r', encoding='utf_8_sig')as f:
                        for i,sen in enumerate(f):
                            if i == line:
                                if m=='translated':
                                    translated.append(sen[:-1])
                                elif m == 'rewrited':
                                    rewrited.append(sen[:-1])
#                                     print(sen)
#                                 print('[{0:^5}] {1:<11} {2:<6}: {3}'.format(i,m,t,sen[:-1]))
#         unaligned_tr = df[(df['line'].isin([line]))&(df['method'].isin(['translated']))]['word'].to_list()
#         unaligned_re = df[(df['line'].isin([line]))&(df['method'].isin(['rewrited']))]['word'].to_list()
# #             print(type(df))
#         print('{0:<11}: {1}'.format('translated',' '.join(unaligned_tr)))
#         print('{0:<11}: {1}'.format('rewrited',' '.join(unaligned_re)))
#         print()
    return translated, rewrited

In [4]:
def cos_similarity(groundtruth, generated):
    try:
        sim = np.dot(groundtruth, generated) / (np.linalg.norm(groundtruth) * np.linalg.norm(generated))
    except ZeroDivisionError:
        sim = 0.0
    return sim
def calc_score(original, translated):
    sim = np.zeros((translated.shape[0],original.shape[0]))
    for i, t in enumerate(translated):
        for j, o in enumerate(original):
            sim[i][j] = cos_similarity(t,o)
    return sim

# For Simplifing Codes

In [5]:
def convert2senvec(tokenizer,model,device,data):
    input_ids, attention_mask, token_type_ids  = encode(tokenizer,data)
#     print(input_ids)
    # input_tokens = tokenizer.convert_ids_to_tokens([x for x in input_ids[0]])
    # print(input_tokens)
    input_ids, attention_mask, token_type_ids = on_gpu(input_ids, attention_mask, token_type_ids, device)
    with torch.no_grad():
        outputs = model(input_ids,attention_mask,token_type_ids,return_dict=False)
    last_hidden_states = outputs[0]
#     print(last_hidden_states.size())
    sentencevec = last_hidden_states[:,0,:]
#     print(sentencevec.size())
    sentencevec=sentencevec.cpu().detach().numpy()
    return sentencevec

# Function for get word-ranking for removal

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

def calc_tfidf(data):
    tfidf = TfidfVectorizer()
    x = tfidf.fit_transform(data)
    df_tfidf = pd.DataFrame(x.toarray(), columns=tfidf.get_feature_names())
    return df_tfidf

In [32]:
def rank_words_by_unalighed_diff_counts(df,corpus='cejc',situation='apology',sen_type='query', diff_type='del', reverse=False, tfidf=False):
    # FOR creating words ranking graph and pos rangling graph
#     cols = {'primitive form':'word count', 'pos':'pos count'}
    df = df[(df['corpus']==corpus)&(df['situation']==situation)&(df['sentence type']==sen_type)&(df['difference type']==diff_type)]
    print('tfidf',tfidf) 
    cols = {'primitive form':'word count'}
    for label_col, data_col in cols.items():
        if tfidf == True:
            df = df.assign(tfidf = list(np.zeros(len(df))))
            dirs = ['/nfs/nas-7.1/yamashita/LAB/giza-pp/primitive']
            corpora = ['cejc']
            situations = ['apology']
            method_type = ['rewrited']
            sentence_type = ['query']
            data = []
            for d in dirs:
                for c in corpora:
                    for s in situations:
                        for m in method_type:
                            if m != 'original':
                                c = 'mpdd' if corpus == 'cejc' else 'mpdd'
                            for t in sentence_type:
                                path =f'{d}/{c}/{s}/{m}_{t}'
                                with open(path, 'r', encoding='utf_8_sig')as f:
                                    for i,sen in enumerate(f):
                                        line = sen[:-1]
                                        data.append(line)
            tfidf_df=calc_tfidf(data)
            for index, row in df.iterrows():
                line= row['line']
                prim= row['primitive form']
                try:
                    df.loc[index, 'tfidf'] = tfidf_df.loc[line, prim]
                except:
#                     print(prim)
                        pass
                
            df[data_col] = df.groupby(['corpus', 'situation', 'method', 'sentence type', 'difference type',label_col])['tfidf'].transform(np.sum)
        else:
            print(False)
            df[data_col] = df.groupby(['corpus', 'situation', 'method', 'sentence type', 'difference type',label_col])['index'].transform('count')
        
        df_word = df.drop_duplicates(['corpus', 'situation', 'method', 'sentence type', 'difference type',label_col]).sort_values(by=data_col, ascending=False)
        df_word = df_word.dropna(subset=[label_col])

        # GET data for 1 graph
#         difference_type = ['del','add']
        method_type = ['translated','rewrited']
        data, labels = [], []
        _temp_w, _temp_wv = [], [] 
        for method in method_type:
            _temp_w.append(df_word[df_word['method']==method][label_col])
            _temp_wv.append(df_word[df_word['method']==method][data_col])
        # PUT data into dictionary, COUNT freq., CALCULATE (rewrited - translated) and SORT them
        dic = {}
        for i,(wline,vline) in enumerate(zip(_temp_w,_temp_wv)):
            for j,(w,v) in enumerate(zip(wline,vline)):
                if w in dic:
                    dic[w]['each'][i]=v
                    v = v*-1 if i==0 else v
                    dic[w]['diff']=dic.get(w,dic.get('diff',0))['diff']+v
                else:
                    tmp = [0,0]
                    tmp[i] = v
                    v = v*-1 if i==0 else v
                    dic.setdefault(w,{'each':tmp,'diff':v})
        dic = sorted(dic.items(),key=lambda x:x[1]['diff'],reverse=reverse)
#         pprint(dic)
        # exit()
#         data_w, data_wv = [],[[],[]]
#         for key, values in dic:
#             data_w.append(key)
#             data_wv[0].append(values['each'][0])
#             data_wv[1].append(values['each'][1]) 
        data_w, data_wv = [],[]
        for key, values in dic:
            data_w.append(key)
            data_wv.append(values['diff'])
        labels.append(data_w)
        data.append(data_wv)
        df_ = pd.DataFrame([data_w,data_wv]).T.set_axis(['word','diff count'],axis='columns')
        print(df_)
        return df_
    
def prim2mrph(df,df_,corpus,situation,sen_type,diff_type,mincnt):
    ranked_words_list = df_[df_['diff count'] >= mincnt]['word'].to_list()
#     print(ranked_words_list)
#     print(len(ranked_words_list))
    df = df[df["corpus"].isin([corpus])&\
            df['situation'].isin([situation])&\
            df["difference type"].isin([diff_type])&\
            df["primitive form"].isin(ranked_words_list)&\
            df['sentence type'].isin([sen_type])]['word']
    word_list = df.to_list()
    return word_list

def get_highdiff_mrphwords(df,corpus,situation,sen_type,diff_type,mincnt,reverse=False,tfidf=False):
    if diff_type=='add':
        corpus = 'mpdd' if corpus == 'cejc' else 'cejc'
    df_ = rank_words_by_unalighed_diff_counts(df,corpus,situation,sen_type,diff_type,reverse,tfidf)
    word_list = prim2mrph(df,df_,corpus,situation,sen_type,diff_type,mincnt)
    return word_list

# Get SenVec

In [35]:
def get_senvecs(corpus,situation,sen_type,diff_type,percentage,top=True,rm_mincount=3,remove=False,tfidf=False):
    # SET ENV USING BERT
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if corpus == "cejc":
        pretrained_model = 'cl-tohoku/bert-base-japanese-whole-word-masking'
    elif corpus == "mpdd":
        pretrained_model = 'bert-base-chinese'
    else:
        pretrained_model = 'bert-base-multilingual-uncased'
    print(pretrained_model)
    tokenizer = BertTokenizer.from_pretrained(pretrained_model)
    model = BertModel.from_pretrained(pretrained_model)

    if torch.cuda.is_available():
        model.cuda()
    
    # GET ORIGINAL-TEXT AND ANALYSIS-TABLE
    data_dir = '../data/'
    original_path = '{}/{}/{}/original_{}.csv'.format(data_dir,corpus,situation,sen_type)
    data = get_data(original_path)
    
    t_path = "analysis_table.csv"
    # t_path = "analysis_table_upper.csv"
    df = pd.read_csv(t_path)
#         print(data)
#     print('original[0]:    ',data[0])
    sentencevec = convert2senvec(tokenizer,model,device,data)

    tgt_corpus='mpdd' if corpus == 'cejc' else 'cejc'
    translated, rewrited = get_diff_sentences(df,tgt_corpus,situation,sen_type,diff_type,percentage,top)
        
    # REMOVE HIGH-DIFF-COUNT-WORDS FROM ORIGINAL-TEXT
    if remove == True:
        highdiff_mrphwords = get_highdiff_mrphwords(df,corpus,situation,sen_type,diff_type,rm_mincount,remove,tfidf)
        if diff_type == 'del':
            for j,w in enumerate(highdiff_mrphwords):
#                 if j<=8:
#                     continue
                for i, d in enumerate(translated):
                    if w in d:
                        translated[i]=d.replace(w,'')
        if diff_type == 'add':
            for j,w in enumerate(highdiff_mrphwords):
                if type(w) == str:
                    pass
                elif math.isnan(w):
                    continue
#                 if j<=8:
#                     continue
                for i, (d,r) in enumerate(zip(data,rewrited)):
#                     if w in d:
#                         data[i]=d.replace(w,'')
                    if w in r:
                        rewrited[i]=r.replace(w,'')
            
    
    print('translated[0]: ', translated[0])
    print('rewrited[0]:   ', rewrited[0])
    t_sentencevec = convert2senvec(tokenizer,model,device,translated)
    r_sentencevec = convert2senvec(tokenizer,model,device,rewrited)
    return sentencevec,t_sentencevec,r_sentencevec


# Calc

In [41]:

corpus = 'cejc'
sen_type = 'query'
diff_type = 'add'
percentage = 70
rm_mincount = 0.2

sentencevec,t_sentencevec,r_sentencevec = get_senvecs(corpus,'apology',sen_type,diff_type,percentage,top=True,rm_mincount=rm_mincount,remove=False,tfidf=True)
t_sim = calc_score(sentencevec,t_sentencevec)
r_sim = calc_score(sentencevec,r_sentencevec)
sim_df = pd.DataFrame([np.mean(t_sim,axis=1),np.mean(r_sim,axis=1)]).T.set_axis(["Machine Translated","Human Translated"],axis='columns')
sim_df['diff'] = sim_df["Human Translated"]-sim_df["Machine Translated"]
# print(sim_df)
print(sim_df['diff'].describe())

sentencevec,t_sentencevec,r_sentencevec = get_senvecs(corpus,'apology',sen_type,diff_type,percentage,top=True,rm_mincount=rm_mincount,remove=True,tfidf=True)
t_sim = calc_score(sentencevec,t_sentencevec)
r_sim = calc_score(sentencevec,r_sentencevec)
sim_df = pd.DataFrame([np.mean(t_sim,axis=1),np.mean(r_sim,axis=1)]).T.set_axis(["Machine Translated","Human Translated"],axis='columns')
sim_df['diff'] = sim_df["Human Translated"]-sim_df["Machine Translated"]
# print(sim_df)
print(sim_df['diff'].describe())
print()

sentencevec,t_sentencevec,r_sentencevec = get_senvecs(corpus,'request',sen_type,diff_type,percentage,top=True,rm_mincount=rm_mincount,remove=False,tfidf=True)
t_sim = calc_score(sentencevec,t_sentencevec)
r_sim = calc_score(sentencevec,r_sentencevec)
sim_df = pd.DataFrame([np.mean(t_sim,axis=1),np.mean(r_sim,axis=1)]).T.set_axis(["Machine Translated","Human Translated"],axis='columns')
sim_df['diff'] = sim_df["Human Translated"]-sim_df["Machine Translated"]
# print(sim_df)
print(sim_df['diff'].describe())

sentencevec,t_sentencevec,r_sentencevec = get_senvecs(corpus,'request',sen_type,diff_type,percentage,top=True,rm_mincount=rm_mincount,remove=True,tfidf=True)
t_sim = calc_score(sentencevec,t_sentencevec)
r_sim = calc_score(sentencevec,r_sentencevec)
sim_df = pd.DataFrame([np.mean(t_sim,axis=1),np.mean(r_sim,axis=1)]).T.set_axis(["Machine Translated","Human Translated"],axis='columns')
sim_df['diff'] = sim_df["Human Translated"]-sim_df["Machine Translated"]
# print(sim_df)
print(sim_df['diff'].describe())
print()

sentencevec,t_sentencevec,r_sentencevec = get_senvecs(corpus,'thanksgiving',sen_type,diff_type,percentage,top=True,rm_mincount=rm_mincount,remove=False,tfidf=True)
t_sim = calc_score(sentencevec,t_sentencevec)
r_sim = calc_score(sentencevec,r_sentencevec)
sim_df = pd.DataFrame([np.mean(t_sim,axis=1),np.mean(r_sim,axis=1)]).T.set_axis(["Machine Translated","Human Translated"],axis='columns')
sim_df['diff'] = sim_df["Human Translated"]-sim_df["Machine Translated"]
# print(sim_df)
print(sim_df['diff'].describe())

sentencevec,t_sentencevec,r_sentencevec = get_senvecs(corpus,'thanksgiving',sen_type,diff_type,percentage,top=True,rm_mincount=rm_mincount,remove=True,tfidf=True)
t_sim = calc_score(sentencevec,t_sentencevec)
r_sim = calc_score(sentencevec,r_sentencevec)
sim_df = pd.DataFrame([np.mean(t_sim,axis=1),np.mean(r_sim,axis=1)]).T.set_axis(["Machine Translated","Human Translated"],axis='columns')
sim_df['diff'] = sim_df["Human Translated"]-sim_df["Machine Translated"]
# print(sim_df)
print(sim_df['diff'].describe())

cl-tohoku/bert-base-japanese-whole-word-masking
mpdd apology add
translated[0]:  孫校長さん、こんにちは、本当に申し訳ありません、先に帰ります、私が直接お願いしたわけではありません、許してください、もう紫江を出ましたか？
rewrited[0]:    もしもし、校長先生ですか？　本当にすみません。お先に失礼いたしました。事前にちゃんと申請せずに休んでしまって本当に申し訳ございません。もう芷江をお出になりましたか？
count    34.000000
mean      0.013329
std       0.032308
min      -0.038637
25%      -0.008268
50%       0.007928
75%       0.036039
max       0.099160
Name: diff, dtype: float64
cl-tohoku/bert-base-japanese-whole-word-masking
mpdd apology add
tfidf True
       word diff count
0        んだ     2.2192
1    ごめんなさい     1.6445
2        お前    1.50489
3        なる    1.38679
4     申し訳ない    1.37623
..      ...        ...
260      こと   -0.67305
261     なれる  -0.678461
262      ます  -0.828039
263      ない   -0.98651
264      する   -1.56429

[265 rows x 2 columns]
translated[0]:  孫校長さん、こんにちは、本当に申し訳ありません、先に帰ります、私が直接お願いしたわけではありません、許してください、もう紫江を出ましたか？
rewrited[0]:    もしもし、先生ですか？　お先に失礼いたしました事前にせずにしまって申し訳ございませ芷江をお出にましたか？
count    34.000000
mean     -0.030831
st