In [8]:
#data_dir = 'data/nested'
data_dir = 'data/nested_sample'

In [9]:
import pandas as pd
import numpy as np
from collections import Counter
from pandarallel import pandarallel
import MeCab
import oseti
from transformers import BertJapaneseTokenizer

tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-large-japanese', additional_special_tokens=['<person>'])
mecab_args = "-r /home/haoki/Documents/vscode-workplaces/lie_detector/project/tokenizer/mecab_userdic/mecabrc -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd"

hedges = ['かも', 'かもしれん','そうかもしれない','かな','なのかな','あるかな','そうかな','っぽい','たぶん','多分','恐らく','めいた','ちょっと','すこし','少し']
self_references = ['私', '私自身','私的','私たち','私達','私見','私は羊','私について','私の主張','私事','私のように','私の部屋','実は私は','私ゃ','明日の私','私が死んでも','私どう','僕','僕自身','僕ら','僕たち','僕のこと','僕は思う','僕達','僕は君','下僕','僕が悪い','僕の話','老僕','ぼく','ぼくが','ぼくら','わたし','わたしゃ','わたしたち','俺','俺様','おれ','おれっち','あたし','あたしゃ','わし','わしゃ']
cognitive_words = ['おもう', '思う','思うに','考え','考える','かんがえる','かんがえ','かんがえなおす']


def feature_counter(nested_df):
    pandarallel.initialize()
    nested_df.loc[:, 'nested_utters'] = nested_df.loc[:, 'nested_utters'].parallel_apply(_feature_counter)
    col_names = ['num_morphemes', 'num_subwords', 'positive', 'negative', 'polarity', 'self_ref', 'cognitive', 'hedges', 'noun', 'verb', 'adjective', 'conjunction', 'particle']
    nested_df = player_feature_count(nested_df, col_names)
    return nested_df

def _feature_counter(df: pd.DataFrame):
    tagger = MeCab.Tagger(mecab_args)
    analyzer = oseti.Analyzer(mecab_args)
    parsed_utters_mecab = df.loc[:, 'raw_nested_utters'].apply(tagger.parse)
    parsed_utters_wordpiece = df.loc[:, 'raw_nested_utters'].apply(tokenizer.tokenize)
    count = {'num_morphemes': [], 'num_subwords': [], 'positive': [], 'negative': [], 'polarity': [], 'self_ref': [], 'cognitive': [], 'hedges': [], 'noun': [], 'verb': [], 'adjective': [], 'conjunction': [], 'particle': []}
    for (_, line), (_, parsed_line_mecab), parsed_line_wordpiece in zip(df.loc[:, 'raw_nested_utters'].items(), parsed_utters_mecab.items(), parsed_utters_wordpiece):
        polarity = analyzer.count_polarity(line) # 東北大の極性辞書を使って、polarityを計算
        count['positive'].append(sum([pol['positive'] for pol in polarity]))
        count['negative'].append(sum([pol['negative'] for pol in polarity]))
        matched_total = sum([pol['positive'] + pol['negative'] for pol in polarity])
        count['polarity'].append(sum([pol['positive'] - pol['negative'] for pol in polarity]) / matched_total if matched_total != 0 else 0) # average

        count['num_morphemes'].append(len(parsed_line_mecab.split('\n')) - 1) # exclude EOS
        count['num_subwords'].append(len(parsed_line_wordpiece))
        mors = [mor.split("\t")[0] for mor in parsed_line_mecab.split("\n")[:-2]] # 形態素のリスト。EOSトークン除外
        count['self_ref'].append(len([mor for mor in mors if mor in self_references]))
        count['cognitive'].append(len([mor for mor in mors if mor in cognitive_words]))
        count['hedges'].append(len([mor for mor in mors if mor in hedges]))

        pos = [mor.split("\t")[1].split(",")[0] for mor in parsed_line_mecab.split("\n")[:-2]] # 品詞をカウント
        pos_count = Counter(pos)
        count['noun'].append(pos_count['名詞'])
        count['verb'].append(pos_count['動詞'])
        count['adjective'].append(pos_count['形容詞'])
        count['conjunction'].append(pos_count['接続詞'])
        count['particle'].append(pos_count['助詞'])
    return pd.concat((df, pd.DataFrame(count)), axis=1)

def sum_mean_std(df, col_name):
    return (
        [np.sum(nested_utters_df.loc[:,col_name]) for nested_utters_df in df.loc[:,'nested_utters']],
        [np.mean(nested_utters_df.loc[:,col_name]) for nested_utters_df in df.loc[:,'nested_utters']],
        [np.std(nested_utters_df.loc[:,col_name]) for nested_utters_df in df.loc[:,'nested_utters']]
    )

def player_feature_count(df, col_names):
    for col_name in col_names:
        df[f"{col_name}_sum"], df[f"{col_name}_mean"], df[f"{col_name}_std"] = sum_mean_std(df, col_name)
    return df

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
train = pd.read_pickle(f'{data_dir}/train.pkl')
valid = pd.read_pickle(f'{data_dir}/valid.pkl')
test = pd.read_pickle(f'{data_dir}/test.pkl')

train = feature_counter(train)
valid = feature_counter(valid)
test = feature_counter(test)

INFO: Pandarallel will run on 112 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 112 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 112 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [11]:
train.head()

Unnamed: 0,nested_utters,num_utters,labels,num_morphemes_sum,num_morphemes_mean,num_morphemes_std,num_subwords_sum,num_subwords_mean,num_subwords_std,positive_sum,...,verb_std,adjective_sum,adjective_mean,adjective_std,conjunction_sum,conjunction_mean,conjunction_std,particle_sum,particle_mean,particle_std
0,raw_nested...,80,1,4660,58.25,25.791714,5048,63.1,28.324724,111,...,4.125833,65,0.8125,1.096514,40,0.5,0.758288,1134,14.175,7.692163
1,raw_nested...,53,0,2594,48.943396,17.726874,2939,55.45283,19.505748,64,...,3.225036,54,1.018868,1.018518,15,0.283019,0.527628,729,13.754717,6.024809
2,raw_nested...,71,0,5284,74.422535,30.388407,5780,81.408451,33.795398,148,...,5.394201,87,1.225352,1.177215,40,0.56338,0.81763,1576,22.197183,9.684632
3,raw_nested...,55,0,3690,67.090909,27.254539,4135,75.181818,29.7487,103,...,4.765787,75,1.363636,1.353294,12,0.218182,0.49326,997,18.127273,8.946415
4,raw_nested...,32,0,2459,76.84375,29.273612,2873,89.78125,33.854223,59,...,6.066606,70,2.1875,2.156929,17,0.53125,0.749349,661,20.65625,9.926635


In [12]:
def calc_stats(nested_df):
    player_num = len(nested_df)
    civil_num = nested_df.loc[:,'labels'].value_counts()[0]
    werewolf_num = nested_df.loc[:,'labels'].value_counts()[1]
    sum_utter_num = nested_df.loc[:,'num_utters'].sum()
    ave_utter_num = nested_df.loc[:,'num_utters'].mean()
    max_utter_num = nested_df.loc[:,'num_utters'].max()
    min_utter_num = nested_df.loc[:,'num_utters'].min()
    sum_mor_num = nested_df.loc[:,'num_morphemes_sum'].sum()
    ave_mor_num_per_player = nested_df.loc[:,'num_morphemes_sum'].mean()
    max_mor_num_per_player = nested_df.loc[:,'num_morphemes_sum'].max()
    min_mor_num_per_player = nested_df.loc[:,'num_morphemes_sum'].min()
    ave_mor_num_per_utter = sum_mor_num / sum_utter_num
    max_mor_num_per_utter = max([df.loc[:,'num_morphemes'].max() for df in nested_df.loc[:,'nested_utters']])
    min_mor_num_per_utter = min([df.loc[:,'num_morphemes'].min() for df in nested_df.loc[:,'nested_utters']])
    sum_sub_num = nested_df.loc[:,'num_subwords_sum'].sum()
    ave_sub_num_per_player = nested_df.loc[:,'num_subwords_sum'].mean()
    max_sub_num_per_player = nested_df.loc[:,'num_subwords_sum'].max()
    min_sub_num_per_player = nested_df.loc[:,'num_subwords_sum'].min()
    ave_sub_num_per_utter = sum_sub_num / sum_utter_num
    max_sub_num_per_utter = max([df.loc[:,'num_subwords'].max() for df in nested_df.loc[:,'nested_utters']])
    min_sub_num_per_utter = min([df.loc[:,'num_subwords'].min() for df in nested_df.loc[:,'nested_utters']])

    return [f'{player_num:,}({civil_num:,}/{werewolf_num:,})', sum_utter_num, ave_utter_num, max_utter_num, min_utter_num, sum_mor_num, ave_mor_num_per_player, max_mor_num_per_player, min_mor_num_per_player, ave_mor_num_per_utter, max_mor_num_per_utter, min_mor_num_per_utter, sum_sub_num, ave_sub_num_per_player, max_sub_num_per_player, min_sub_num_per_player, ave_sub_num_per_utter, max_sub_num_per_utter, min_sub_num_per_utter]


def make_stats_table(train, valid, test):
    index = ['プレイヤー数(人狼/市民)', '合計発話数', '平均発話数', '最大発話数', '最小発話数', 
                '形態素数合計', '1プレイヤーにおける平均形態素数', '1プレイヤーにおける最大形態素数', '1プレイヤーにおける最小形態素数', '1発話における平均形態素数', '1発話における最大形態素数', '1発話における最小形態素数', 
                'サブワード数合計', '1プレイヤーにおける平均サブワード数', '1プレイヤーにおける最大サブワード数', '1プレイヤーにおける最小サブワード数', '1発話における平均サブワード数', '1発話における最大サブワード数', '1発話における最小サブワード数']
    train_row = calc_stats(train)
    valid_row = calc_stats(valid)
    test_row = calc_stats(test)
    stats_table = pd.DataFrame({'train': train_row, 'valid': valid_row, 'test': test_row}, index=index)
    stats_table.iloc[1:] = stats_table.iloc[1:].applymap(lambda x: f'{x:,.2f}')

    return stats_table

In [13]:
stats_table = make_stats_table(train, valid, test)
display(stats_table)
stats_table.to_csv(f'{data_dir}/stats_mecab.csv')

Unnamed: 0,train,valid,test
プレイヤー数(人狼/市民),42(21/21),6(3/3),6(3/3)
合計発話数,3555.00,255.00,558.00
平均発話数,84.64,42.50,93.00
最大発話数,134.00,94.00,116.00
最小発話数,32.00,17.00,60.00
形態素数合計,252024.00,16352.00,40035.00
1プレイヤーにおける平均形態素数,6000.57,2725.33,6672.50
1プレイヤーにおける最大形態素数,11455.00,5183.00,7478.00
1プレイヤーにおける最小形態素数,2459.00,1096.00,5520.00
1発話における平均形態素数,70.89,64.13,71.75


In [None]:
train = train.drop('nested_utters', axis=1)
display(train.corr().style.background_gradient(axis=None))

Unnamed: 0,num_utters,labels,num_morphemes_sum,num_morphemes_mean,num_morphemes_std,positive_sum,positive_mean,positive_std,negative_sum,negative_mean,negative_std,polarity_sum,polarity_mean,polarity_std,self_ref_sum,self_ref_mean,self_ref_std,cognitive_sum,cognitive_mean,cognitive_std,hedges_sum,hedges_mean,hedges_std,noun_sum,noun_mean,noun_std,verb_sum,verb_mean,verb_std,adjective_sum,adjective_mean,adjective_std,conjunction_sum,conjunction_mean,conjunction_std,particle_sum,particle_mean,particle_std
num_utters,1.0,0.023807,0.919486,0.369049,0.009213,0.859071,0.279128,0.240548,0.855104,0.357629,0.309901,-0.117999,-0.130795,-0.216722,0.562852,0.112114,0.129217,0.733765,0.233936,0.250403,0.68272,0.129678,0.179204,0.896197,0.359044,0.128977,0.906192,0.337808,0.145448,0.839064,0.167613,0.148665,0.795903,0.261337,0.28007,0.907358,0.333518,0.045241
labels,0.023807,1.0,0.020607,0.020526,0.038858,0.00279,-0.012305,0.005087,0.023261,0.02989,0.024221,-0.078694,-0.077236,-0.013933,0.02369,0.022261,0.029433,0.03797,0.033893,0.042478,0.061238,0.067654,0.066431,0.002036,-0.018129,-0.017131,0.034179,0.050077,0.054068,0.020265,0.014525,0.02359,0.034937,0.037755,0.037314,0.028886,0.035571,0.038318
num_morphemes_sum,0.919486,0.020607,1.0,0.660109,-0.088811,0.960229,0.539585,0.402402,0.955205,0.602853,0.453694,-0.129247,-0.127814,-0.442704,0.581327,0.177319,0.171787,0.798091,0.375871,0.352894,0.734178,0.248992,0.277245,0.988003,0.638317,0.118686,0.982722,0.603808,0.147249,0.920568,0.395072,0.295116,0.836711,0.39201,0.376031,0.987385,0.60841,-0.010204
num_morphemes_mean,0.369049,0.020526,0.660109,1.0,-0.114838,0.673403,0.863338,0.621363,0.657685,0.850296,0.606951,-0.060706,-0.053194,-0.704272,0.338097,0.240382,0.197752,0.524768,0.520513,0.454003,0.477191,0.406428,0.397136,0.668297,0.953236,0.150626,0.647634,0.923937,0.215471,0.624776,0.709875,0.509615,0.509789,0.516321,0.462027,0.65596,0.945463,0.022011
num_morphemes_std,0.009213,0.038858,-0.088811,-0.114838,1.0,-0.079932,-0.067359,0.240498,-0.08386,-0.069323,0.204113,0.015195,-0.004785,0.07979,-0.051204,-0.048636,0.020914,-0.076719,-0.059066,0.040957,-0.063934,-0.047513,0.009928,-0.088921,-0.100405,0.796569,-0.081966,-0.091971,0.729485,-0.083268,-0.08898,0.140543,-0.087075,-0.095473,0.007911,-0.093413,-0.115935,0.888153
positive_sum,0.859071,0.00279,0.960229,0.673403,-0.079932,1.0,0.677221,0.534348,0.95074,0.658344,0.502322,-0.006134,-0.021742,-0.518905,0.55463,0.177833,0.176039,0.775094,0.39141,0.36158,0.694358,0.245742,0.272311,0.962789,0.673437,0.153659,0.947439,0.622922,0.167645,0.905717,0.443859,0.337948,0.781701,0.367072,0.352404,0.949617,0.624184,-0.000454
positive_mean,0.279128,-0.012305,0.539585,0.863338,-0.067359,0.677221,1.0,0.801859,0.586986,0.818683,0.61188,0.160948,0.182754,-0.737092,0.27106,0.200074,0.176507,0.443499,0.471827,0.408501,0.375694,0.336038,0.329626,0.567619,0.866925,0.205194,0.535017,0.808803,0.231237,0.545879,0.682966,0.510889,0.383194,0.38905,0.349943,0.538485,0.821084,0.048664
positive_std,0.240548,0.005087,0.402402,0.621363,0.240498,0.534348,0.801859,1.0,0.444721,0.616685,0.561882,0.097126,0.101648,-0.491358,0.191768,0.120008,0.129589,0.327722,0.341142,0.330988,0.272986,0.231869,0.24983,0.428329,0.641556,0.42301,0.397135,0.579314,0.392223,0.403527,0.485561,0.447622,0.278716,0.260563,0.264858,0.39726,0.581875,0.302384
negative_sum,0.855104,0.023261,0.955205,0.657685,-0.08386,0.95074,0.586986,0.444721,1.0,0.735491,0.580622,-0.276639,-0.245313,-0.515986,0.546862,0.169455,0.169113,0.76842,0.379892,0.350241,0.687746,0.235772,0.261846,0.958895,0.659232,0.146539,0.940448,0.605682,0.150755,0.892812,0.419644,0.314853,0.777365,0.360463,0.345502,0.942533,0.605885,-0.016943
negative_mean,0.357629,0.02989,0.602853,0.850296,-0.069323,0.658344,0.818683,0.616685,0.735491,1.0,0.800886,-0.323336,-0.325536,-0.736033,0.304446,0.192067,0.172942,0.490165,0.462823,0.403184,0.417853,0.320046,0.318646,0.629445,0.854203,0.199986,0.595234,0.794144,0.213565,0.589114,0.634611,0.466086,0.441006,0.396713,0.3607,0.597885,0.801401,0.022959


In [None]:
import pandas_profiling
pandas_profiling.ProfileReport(train)

Summarize dataset:   0%|          | 0/51 [00:00<?, ?it/s]