In [5]:
import os
import csv
import pandas as pd
from pprint import pprint
import numpy as np
import scipy.stats as stats
import matplotlib as plt
import matplotlib.style
import itertools
from collections import Counter
# Emotion label of each CSV column, in column order (column 0 -> 'Sadness', ...).
jiwc_emotions = ('Sadness', 'Anxiety', 'Anger', 'Disgust', 'Trust', 'Surprise', 'Joy')

files_path = [  '../sentiment_analysis/JIWC_Dictionary/ver_2018/JIWC-C_2018.csv',
                '../sentiment_analysis/JIWC_Dictionary/ver_2019/JIWC-C_2019.csv',
                '../sentiment_analysis/JIWC_Dictionary/JIWC-C_2018_2019.csv']

# word -> emotion label. setdefault() keeps the first label a word receives, so
# both the file order above and the column order matter when a word is listed
# more than once.
jiwc = {}
for path in files_path:
    with open(path,'r') as f:
        rows = csv.reader(f)
        # The first row of every file is skipped (presumably the header line).
        next(rows, None)
        for row in rows:
            for column, emotion_label in enumerate(jiwc_emotions):
                jiwc.setdefault(row[column], emotion_label)

def extract_words_by_emotion(unaligned, emotion, stop_words, lexicon=None):
    """Keep, sentence by sentence, the words labelled with ``emotion``.

    Args:
        unaligned: iterable of sentences, each an iterable of words.
        emotion: label a word must carry in the lexicon to be kept
            (e.g. ``'Joy'``).
        stop_words: container of words that are always dropped, even when
            the lexicon lists them.
        lexicon: optional ``word -> emotion label`` mapping. When omitted,
            the module-level ``jiwc`` dictionary is used, which is what the
            original implementation always did.

    Returns:
        A list with one entry per input sentence. Each entry is the list of
        matching words in their original order (repeated words are kept);
        it is empty when the sentence has no match.
    """
    if lexicon is None:
        # Looked up at call time so the function still follows the notebook's
        # global dictionary when no explicit lexicon is passed.
        lexicon = jiwc
    return [
        [
            w for w in sen
            if w not in stop_words and w in lexicon and lexicon[w] == emotion
        ]
        for sen in unaligned
    ]

def get_data_as_list(fpath) -> list:
    """Read a text file as a list of token lists, one per line.

    The file is decoded as UTF-8 with an optional BOM ("utf-8-sig"). Each
    line has its newline removed and is split on single spaces, so an empty
    line yields ``['']`` and consecutive spaces yield empty tokens.

    Args:
        fpath: path of the file to read.

    Returns:
        A list with one list of string tokens per line of the file.
    """
    with open(fpath, "r", encoding="utf-8-sig") as f:
        return [raw_line.replace("\n","").split(" ") for raw_line in f]

def get_unaligned_mrphwords(unaligned_index,mrph):
    """Look up the words that the unaligned indices point to.

    Args:
        unaligned_index: one list of index strings per sentence. Empty
            strings are ignored (an empty line of the index file is read
            as ``['']``).
        mrph: one list of words per sentence; row ``i`` of
            ``unaligned_index`` is resolved against ``mrph[i]``.

    Returns:
        A list with one list per row of ``unaligned_index`` holding the
        words found at the given positions, in the order they were listed.

    Indices that cannot be resolved (not an integer, out of range, or no
    sentence ``i`` in ``mrph``) are reported on stdout and skipped, so one
    bad entry does not abort the whole run. Only ``IndexError`` and
    ``ValueError`` are treated this way; the previous bare ``except`` also
    swallowed ``KeyboardInterrupt`` and unrelated bugs, and its diagnostics
    crashed themselves when ``mrph`` had no row ``i``.
    """
    unaligned_mrphwords = []
    for sent_no, index in enumerate(unaligned_index):
        found = []
        for indice in index:
            if indice == "":
                continue
            try:
                found.append(mrph[sent_no][int(indice)])
            except (IndexError, ValueError):
                print('index: ',index)
                print('indice: ',indice)
                if sent_no < len(mrph):
                    print('len(mrph[int(i)]): ',len(mrph[sent_no]))
                    print(mrph[sent_no])
                else:
                    # The index data has more rows than the morpheme data.
                    print('no mrph sentence for line: ',sent_no)
        unaligned_mrphwords.append(found)
    return unaligned_mrphwords


def get_ranking(corpus,situation,sen_type,emotion,diff_type):
    """Rank the JIWC emotion words among the unaligned words of two outputs.

    For each of the methods 'rewrited' and 'translated' the function reads
    the unaligned-index file
    ``../analysis/unaligned_index/{corpus}/{situation}/{method}_{sen_type}.{diff_type}``
    and the morpheme file
    ``../mrphdata/{corpus}/{situation}/{ref}_{sen_type}``, where ``ref`` is
    'original' when ``diff_type`` is 'del' and the method name otherwise.
    The unaligned words carrying ``emotion`` are then counted.

    Args:
        corpus, situation, sen_type, diff_type: path components, see above.
        emotion: JIWC emotion label passed to ``extract_words_by_emotion``.

    Returns:
        ``[mt_words, mt_val, ht_words, ht_val, words, diff_val]``:
        words and frequencies for the 'translated' data (``mt_*``) and the
        'rewrited' data (``ht_*``), each ordered by descending frequency,
        followed by all words and their ``rewrited - translated`` frequency
        difference, ordered by descending difference.
    """
    stop_words=["の","に","こと","",""]

    # method name -> per-sentence lists of unaligned words
    unaligned_by_method = {}
    for method in ('rewrited', 'translated'):
        unaligned_index_fpath= '../analysis/unaligned_index/' + f"{corpus}/{situation}/{method}_{sen_type}.{diff_type}"
        # 'del' indices are resolved against the original sentences.
        ref_method = method if diff_type != 'del' else 'original'
        mrph_fpath= '../mrphdata/'+ f"{corpus}/{situation}/{ref_method}_{sen_type}"
        index_rows = get_data_as_list(unaligned_index_fpath)
        mrph_rows = get_data_as_list(mrph_fpath)
        unaligned_by_method[method] = get_unaligned_mrphwords(index_rows, mrph_rows)

    def frequency_ranking(sentences):
        # Flatten the per-sentence lists and count, most frequent first.
        return Counter(itertools.chain.from_iterable(sentences)).most_common()

    mt_ranked = frequency_ranking(
        extract_words_by_emotion(unaligned_by_method['translated'], emotion, stop_words))
    ht_ranked = frequency_ranking(
        extract_words_by_emotion(unaligned_by_method['rewrited'], emotion, stop_words))

    mt_words = [word for word, _ in mt_ranked]
    mt_val = [freq for _, freq in mt_ranked]
    ht_words = [word for word, _ in ht_ranked]
    ht_val = [freq for _, freq in ht_ranked]

    # Start from the rewrited counts and subtract the translated ones. The
    # insertion order (rewrited ranking first, then translated-only words)
    # decides how the stable sort below orders equal differences.
    difference = dict(ht_ranked)
    for word, freq in mt_ranked:
        difference[word] = difference.get(word, 0) - freq
    diff_ranked = sorted(difference.items(), key=lambda item: item[1], reverse=True)

    words = [word for word, _ in diff_ranked]
    diff_val = [value for _, value in diff_ranked]

    return [mt_words, mt_val, ht_words, ht_val, words, diff_val]


|diff type|corpus|sen type|situation|emotion|
|---|---|---|---|---|
|del|cejc|query|apology||
|del|cejc|query|request|Trust|
|del|cejc|query|thanksgiving|Trust|
|del|cejc|res|apology||
|del|cejc|res|request|Trust|
|del|cejc|res|thanksgiving||
|add|mpdd|query|apology|Disgust|
|add|mpdd|query|request|Sadness, Disgust, Joy|
|add|mpdd|query|thanksgiving|Sadness, Disgust, Trust, Joy|
|add|mpdd|res|apology||
|add|mpdd|res|request|Sadness, Disgust, Trust, Joy|
|add|mpdd|res|thanksgiving|Sadness|

In [9]:
# (unalignment_type, corpus, sentence_type, situation, emotion) combinations to
# rank: one entry per emotion listed in the table above.
sig_list=[   ['del','cejc','query','request','Trust'],
                ['del','cejc','query','thanksgiving','Trust'],
                ['del','cejc','res','request','Trust'],
                ['add','mpdd','query','apology','Disgust'],
                ['add','mpdd','query','request','Sadness'],
                ['add','mpdd','query','request','Disgust'],
                ['add','mpdd','query','request','Joy'],
                ['add','mpdd','query','thanksgiving','Sadness'],
                ['add','mpdd','query','thanksgiving','Disgust'],
                ['add','mpdd','query','thanksgiving','Trust'],
                ['add','mpdd','query','thanksgiving','Joy'],
                ['add','mpdd','res','request','Sadness'],
                ['add','mpdd','res','request','Disgust'],
                ['add','mpdd','res','request','Trust'],
                ['add','mpdd','res','request','Joy'],
                ['add','mpdd','res','thanksgiving','Sadness']]

# get_ranking() returns six lists; these are their (count_type, ranking)
# labels, in the order the lists are returned.
ranking_row_labels = [('MT','word'),('MT','freq'),
                      ('HT','word'),('HT','freq'),
                      ('Diff','word'),('Diff','freq')]

data = []      # one list per output row (6 rows per sig_list entry)
tuples = []    # the matching MultiIndex entry for every row of `data`
for unalignment_type, corpus, sentence_type, situation, emotion in sig_list:
    # Language of the words being ranked. 'del' words are read from the
    # original sentences, 'add' words from the translated/rewritten ones
    # (see ref_method in get_ranking). This is the same mapping the CLIWC cell
    # below uses; the mapping previously used here was its exact inverse and
    # labelled every JIWC row 'ch'.
    # Assumes 'cejc' is the Japanese-source corpus and any other corpus is
    # Chinese-source, as the CLIWC cell does -- confirm if a corpus is added.
    source_lang, target_lang = ('jp', 'ch') if corpus == 'cejc' else ('ch', 'jp')
    language = source_lang if unalignment_type == 'del' else target_lang

    data.extend(get_ranking(corpus,situation,sentence_type,emotion,unalignment_type))
    tuples.extend(
        (language, corpus, situation, sentence_type, count_type, unalignment_type, emotion, ranking)
        for count_type, ranking in ranking_row_labels
    )

index = pd.MultiIndex.from_tuples(tuples, names=["language","corpus", "situation","sentence_type","count_type","unalignment_type","emotion","ranking",])
# The rows have different lengths; pandas pads the shorter ones.
table = pd.DataFrame(data,index=index)

dir_name = 'for_thesis/LIWC_word_ranking/'
os.makedirs(dir_name,exist_ok = True)
table.to_csv(dir_name+'JIWC.csv',encoding='utf_8_sig')

In [17]:
# df = pd.read_csv(dir_name+'JIWC.csv')
# df = df[df['ranking'].str.contains('freq')]
# df


her


# CLIWC

In [33]:
import re
import csv
import numpy as np
import pandas as pd
from pprint import pprint
from inlp.convert import chinese
import scipy.stats as stats
import matplotlib as plt
import matplotlib.style

fpath = '../sentiment_analysis/CLIWC_Dictionary/cliwc2015 v1.4.3.dic'

# category id -> category name, read from file lines 1..79 (0-based).
category_dict = {}
# dictionary word -> list of category ids, read from file line 81 onwards.
# Words are passed through chinese.t2s first (presumably traditional ->
# simplified conversion -- confirm against the inlp package).
cliwc = {}
with open(fpath, "r", encoding="utf-8-sig") as f:
    for line_no, raw_line in enumerate(f):
        # Strip leading tabs/asterisks, remove any "(...)" span, then drop
        # trailing whitespace before splitting into tab-separated fields.
        cleaned = re.sub(r"\(.+\)","",raw_line.lstrip("\t*")).rstrip()
        fields = cleaned.replace("\n","").split("\t")
        # Lines 0 and 80 are ignored (presumably the delimiter lines around
        # the category section -- confirm against the .dic file).
        if 1 <= line_no <= 79:
            category_dict.setdefault(fields[0],fields[1])
        elif line_no >= 81:
            cliwc.setdefault(chinese.t2s(fields[0]),fields[1:])

def extract_words_by_emotion_(unaligned, emotion, stop_words,
                              lexicon=None, categories=None, max_ngram=4):
    """Collect, per sentence, the dictionary entries that belong to ``emotion``.

    For every word that is not a stop word, the word itself and the
    concatenations of it with up to ``max_ngram - 1`` preceding words are
    looked up in the lexicon. An entry is collected once for each of its
    category ids whose category name equals ``emotion``. N-grams are only
    built for positions whose own word is not a stop word; preceding words
    inside the n-gram are not checked against ``stop_words``.

    Args:
        unaligned: iterable of sentences, each a list of word strings.
        emotion: category name to match (e.g. ``'posemo'``).
        stop_words: container of words whose positions are skipped entirely.
        lexicon: optional ``entry -> list of category ids`` mapping;
            defaults to the module-level ``cliwc``.
        categories: optional ``category id -> category name`` mapping;
            defaults to the module-level ``category_dict``.
        max_ngram: longest n-gram to try; the default 4 reproduces the
            original unigram/bigram/trigram/quadgram lookups.

    Returns:
        A list with one list of matched entries per input sentence.

    Raises:
        KeyError: if a lexicon entry refers to a category id that is missing
            from ``categories``.
    """
    if lexicon is None:
        lexicon = cliwc
    if categories is None:
        categories = category_dict

    emo_words = []
    for sen in unaligned:
        hits = []
        for j, w in enumerate(sen):
            if w in stop_words:
                continue
            for n in range(1, max_ngram + 1):
                if j < n - 1:
                    # Not enough preceding words for this or any longer n-gram.
                    break
                gram = "".join(sen[j - n + 1:j + 1])
                if gram in lexicon:
                    hits.extend(
                        [gram for key in lexicon[gram] if categories[key] == emotion]
                    )
        emo_words.append(hits)
    return emo_words

def get_ranking_(corpus,situation,sen_type,emotion,diff_type):
    """Rank the CLIWC emotion words among the unaligned words of two outputs.

    For each of the methods 'rewrited' and 'translated' the function reads
    the unaligned-index file
    ``../analysis/unaligned_index/{corpus}/{situation}/{method}_{sen_type}.{diff_type}``
    and the morpheme file
    ``../mrphdata/{corpus}/{situation}/{ref}_{sen_type}``, where ``ref`` is
    'original' when ``diff_type`` is 'del' and the method name otherwise.
    The unaligned entries matching ``emotion`` are then counted; no stop
    words are applied.

    Args:
        corpus, situation, sen_type, diff_type: path components, see above.
        emotion: CLIWC category name passed to ``extract_words_by_emotion_``.

    Returns:
        ``[mt_words, mt_val, ht_words, ht_val, words, diff_val]``:
        words and frequencies for the 'translated' data (``mt_*``) and the
        'rewrited' data (``ht_*``), each ordered by descending frequency,
        followed by all words and their ``rewrited - translated`` frequency
        difference, ordered by descending difference.
    """
    stop_words=[]

    def load_unaligned(method):
        # Resolve the unaligned indices of one method to words.
        unaligned_index_fpath= '../analysis/unaligned_index/' + f"{corpus}/{situation}/{method}_{sen_type}.{diff_type}"
        # 'del' indices are resolved against the original sentences.
        if diff_type=='del':
            ref_method = 'original'
        else:
            ref_method = method
        mrph_fpath= '../mrphdata/'+ f"{corpus}/{situation}/{ref_method}_{sen_type}"
        index_rows = get_data_as_list(unaligned_index_fpath)
        mrph_rows = get_data_as_list(mrph_fpath)
        return get_unaligned_mrphwords(index_rows, mrph_rows)

    rewrited_unaligned = load_unaligned('rewrited')
    translated_unaligned = load_unaligned('translated')

    translated_emo_words = extract_words_by_emotion_(translated_unaligned,emotion,stop_words)
    rewrited_emo_words = extract_words_by_emotion_(rewrited_unaligned,emotion,stop_words)

    # (word, frequency) pairs, most frequent first
    mt_pairs = Counter(itertools.chain.from_iterable(translated_emo_words)).most_common()
    ht_pairs = Counter(itertools.chain.from_iterable(rewrited_emo_words)).most_common()

    mt_words = [pair[0] for pair in mt_pairs]
    mt_val = [pair[1] for pair in mt_pairs]
    ht_words = [pair[0] for pair in ht_pairs]
    ht_val = [pair[1] for pair in ht_pairs]

    # rewrited count minus translated count per word. Keys are inserted in
    # rewrited ranking order, then translated-only words, which fixes the
    # order of equal differences in the stable sort below.
    balance = dict(ht_pairs)
    for word, freq in mt_pairs:
        balance[word] = balance.get(word, 0) - freq
    ordered = sorted(balance.items(), key=lambda entry: entry[1], reverse=True)

    words = [pair[0] for pair in ordered]
    diff_val = [pair[1] for pair in ordered]

    return [mt_words, mt_val, ht_words, ht_val, words, diff_val]

In [34]:

# (unalignment_type, corpus, sentence_type, situation, emotion) combinations to
# rank with the CLIWC dictionary.
sig_list=[   ['del',	'mpdd',	'query',	'request',		'affect'],
                ['del',	'mpdd',	'query',	'request',		'negemo'],
                ['del',	'mpdd',	'query',	'request',		'anger'],
                ['del',	'mpdd',	'res',	'thanksgiving',	'affect'],
                ['add',	'cejc',	'query',	'apology',	    'affect'],
                ['add',	'cejc',	'query',	'apology',	    'posemo'],
                ['add',	'cejc',	'query',	'apology',	    'negemo'],
                ['add',	'cejc',	'query',	'apology',	    'anger'],
                ['add',	'cejc',	'query',	'request',	    'negemo'],
                ['add',	'cejc',	'res',	'request',	    'affect'],
                ['add',	'cejc',	'res',	'request',	    'posemo']]

# get_ranking_() returns six lists; these are their (count_type, ranking)
# labels, in the order the lists are returned.
cliwc_row_labels = [('MT','word'),('MT','freq'),
                    ('HT','word'),('HT','freq'),
                    ('Diff','word'),('Diff','freq')]

data = []      # one list per output row (6 rows per sig_list entry)
tuples = []    # the matching MultiIndex entry for every row of `data`
for entry in sig_list:
    unalignment_type, corpus, sentence_type, situation, emotion = entry[:5]

    # Language label: for 'del' rows cejc -> 'jp' and any other corpus -> 'ch';
    # for all other unalignment types the two labels are swapped.
    is_cejc = corpus == 'cejc'
    if unalignment_type == 'del':
        language = 'jp' if is_cejc else 'ch'
    else:
        language = 'ch' if is_cejc else 'jp'

    data.extend(get_ranking_(corpus,situation,sentence_type,emotion,unalignment_type))
    for count_type, ranking in cliwc_row_labels:
        tuples.append((f'{language}', f'{corpus}', f'{situation}', f'{sentence_type}',
                       count_type, f'{unalignment_type}', f'{emotion}', ranking))

index = pd.MultiIndex.from_tuples(tuples, names=["language","corpus", "situation","sentence_type","count_type","unalignment_type","emotion","ranking",])
# The rows have different lengths; pandas pads the shorter ones.
table = pd.DataFrame(data,index=index)

dir_name = 'for_thesis/LIWC_word_ranking/'
os.makedirs(dir_name,exist_ok = True)
table.to_csv(dir_name+'CLIWC.csv',encoding='utf_8_sig')

# Unaligned emotion word count and unique count

In [50]:
import pandas as pd
import os
# Columns that identify one analysed condition.
group_keys = ['diff_type','corpus','situation','sen_type','emotion']
# Column names assigned to the reason tables when they are read.
reason_columns = group_keys + ['word','htmt','line','part','effect','direct','intense','perspective']


def _count_ht_words(csv_path):
    """Count total and distinct words of the 'HT' rows of one reason table.

    Reads ``csv_path`` with ``reason_columns`` as column names, keeps the
    rows whose 'htmt' value contains 'HT', and returns one row per
    ``group_keys`` combination with the columns '# of words' (number of
    'word' values) and '# unique of words' (number of distinct ones).
    """
    rows = pd.read_csv(csv_path, names=reason_columns)
    ht_rows = rows[rows['htmt'].str.contains('HT')]
    words_per_group = ht_rows.groupby(group_keys, as_index=False)['word']
    totals = words_per_group.count().rename(columns={'word':'# of words'})
    uniques = words_per_group.nunique().rename(columns={'word':'# unique of words'})
    return pd.merge(totals, uniques, on=group_keys)


# CLIWC rows first, then JIWC rows; each part keeps its own 0-based row labels.
table = pd.concat([_count_ht_words('CLIWC_diff_reason_table.csv'),
                   _count_ht_words('JIWC_diff_reason_table.csv')])
table = table[['corpus','situation','sen_type','diff_type','emotion','# of words','# unique of words']]
table = table.set_axis(['corpus','situation','sentence_type','alignment_type','emotion','# of words','# unique of words'],axis=1)


dir_name = 'for_thesis/LIWC_word_ranking/'
os.makedirs(dir_name,exist_ok = True)
table.to_csv(dir_name+'word_counts.csv',encoding='utf_8_sig')
