## pytorch-lightningで、multi processがスタートした時、pickle出来ないオブジェクトが存在したため、pickle可能にするために頑張った記録。

In [7]:
# -*- coding: utf-8 -*-
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning import loggers as pl_loggers
import pandas as pd
import glob
import argparse
from pathlib import Path
import sys, os
sys.path.append(os.pardir)
from model.HAN import HierAttnNet
from model.HANDataModule import CreateHANDataModule
from preprocess.tokenizer_HAN import HANtokenizer

In [2]:
N_EPOCHS = 20
batch_size = 16
vocab_size=32000
word_hidden_dim=32
sent_hidden_dim=32
padding_idx=1
embed_dim=200

tokenizer = HANtokenizer(vocab_size=vocab_size)

pl.seed_everything(111)

Global seed set to 111


111

In [9]:
model = HierAttnNet(vocab_size=vocab_size, word_hidden_dim=word_hidden_dim, sent_hidden_dim=sent_hidden_dim, padding_idx=padding_idx,
                        embed_dim=embed_dim, embedding_matrix=tokenizer.embedding_matrix
                    )

### tokenizerがpicklable出ないことに気が付いた。

In [3]:
import pickle
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

### MeCabが問題じゃね思ったので試してみた。⇒やっぱりな..

In [14]:
import MeCab
wakati = MeCab.Tagger("-O wakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
import pickle
with open('mecab.pkl', 'wb') as f:
    pickle.dump(wakati, f)

TypeError: cannot pickle 'SwigPyObject' object

### Custom taggerを作った [参考](https://tma15.github.io/blog/2020/11/22/pythonmecab%E3%81%AEtagger%E3%82%AA%E3%83%96%E3%82%B8%E3%82%A7%E3%82%AF%E3%83%88%E3%82%92%E6%8C%81%E3%81%A4%E5%8D%98%E8%AA%9E%E5%88%86%E5%89%B2%E5%99%A8%E3%82%92pickle%E3%81%A7%E4%BF%9D%E5%AD%98%E3%81%99%E3%82%8B%E6%96%B9%E6%B3%95/)

In [18]:
import MeCab
import pickle


class MyTagger:
    def __init__(self, option=''):
        self.option = option
        self.tagger = MeCab.Tagger(option)

    def __getstate__(self):
        return {'option': self.option}

    def __setstate__(self, state):
        for k, v in state.items():
            setattr(self, k, v)

    def __getnewargs__(self):
        return self.option,

    def __reduce_ex__(self, proto):
        func = MyTagger
        args = self.__getnewargs__()
        state = self.__getstate__()
        listitems = None
        dictitems = None
        rv = (func, args, state, listitems, dictitems)
        return rv

    def __call__(self, text):
        ret = self.tagger.parse(text).rstrip()
        return ret


if __name__ == '__main__':
    text = 'すもももももももものうち'

    with open('test.pickle', 'wb') as f:
        t = MyTagger("-O wakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
        pickle.dump(t, f)
        print(t(text))


    with open('test.pickle', 'rb') as f:
        t2 = pickle.load(f)
        print(t2(text))

すもももももももものうち
すもももももももものうち


In [2]:
import os, sys
sys.path.append(os.pardir)
from preprocess.custom_mecab_tagger import CustomMeCabTagger

In [4]:
import pickle

with open('test.pickle', 'wb') as f:
        t = CustomMeCabTagger("-O wakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
        pickle.dump(t, f)

In [10]:
t = CustomMeCabTagger("-O wakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")

In [13]:
t('私の名前は、青木です。')

'私 の 名前 は 、 青木 です 。'

## train.pyの作成

In [14]:
# -*- coding: utf-8 -*-
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning import loggers as pl_loggers
import pandas as pd
import glob
import argparse
from pathlib import Path
import sys, os
sys.path.append(os.pardir)
from model.HAN import HierAttnNet
from model.HANDataModule import CreateHANDataModule
from preprocess.tokenizer_HAN import HANtokenizer

In [15]:
train_df = pd.read_pickle('../model/data/nested/train.pkl')
valid_df = pd.read_pickle('../model/data/nested/valid.pkl')
test_df = pd.read_pickle('../model/data/nested/test.pkl')

In [16]:
checkpoints_dir = "./checkpoints/HAN"
log_dir = "./lightning_logs/HAN"
os.makedirs(checkpoints_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)

In [17]:
N_EPOCHS = 20
batch_size = 16
vocab_size=32000
word_hidden_dim=32
sent_hidden_dim=32
padding_idx=1
embed_dim=200

tokenizer = HANtokenizer(vocab_size=vocab_size)

pl.seed_everything(111)

Global seed set to 111


111

In [18]:
data_module = CreateHANDataModule(train_df, valid_df, test_df, batch_size=batch_size, tokenizer=tokenizer)

In [19]:
model = HierAttnNet(vocab_size=vocab_size, word_hidden_dim=word_hidden_dim, sent_hidden_dim=sent_hidden_dim, padding_idx=padding_idx,
                        embed_dim=embed_dim, embedding_matrix=tokenizer.embedding_matrix
                    )

In [20]:
early_stop_callback = EarlyStopping(
    monitor='val_loss',
    min_delta=0.005,
    patience=3,
    mode='min'
)

In [21]:
checkpoint_callback = ModelCheckpoint(
    dirpath=checkpoints_dir,
    filename='{epoch}',
    verbose=True,
    monitor='val_loss',
    mode='min'
)

In [9]:
tb_logger = pl_loggers.TensorBoardLogger(save_dir=log_dir)

In [22]:
trainer = pl.Trainer(max_epochs=N_EPOCHS,
                        gpus="6",
                        precision=16,
                        progress_bar_refresh_rate=10,
                        callbacks=[checkpoint_callback, early_stop_callback],
                        logger=tb_logger
)

  rank_zero_deprecation(
Using native 16bit precision.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [23]:
trainer.fit(model=model, datamodule=data_module)

  rank_zero_warn(f"you defined a {step_name} but have no {loader_name}. Skipping {stage} loop")
A100-PCIE-40GB with CUDA capability sm_80 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.
If you want to use the A100-PCIE-40GB GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6]

  | Name         | Type             | Params
--------------------------------------------------
0 | wordattnnet  | WordAttnNet      | 6.4 M 
1 | sentattennet | SentAttnNet      | 23.0 K
2 | ld           | Dropout          | 0     
3 | fc           | Linear           | 130   
4 | criterion    | CrossEntropyLoss | 0     
--------------------------------------------------
6.5 M     Trainable params
0         Non-trainable params
6.5 M     Total params
25.889    Total estimated model params size (MB)


                                           

Global seed set to 111


Epoch 0:   0%|          | 0/233 [00:00<00:00, 987.36it/s]   



Epoch 0: 100%|██████████| 233/233 [08:47<00:00,  2.25s/it, loss=0.693, v_num=15]

RuntimeError: Early stopping conditioned on metric `val_loss` which is not available. Pass in or modify your `EarlyStopping` callback to use any of the following: ``

Epoch 0: 100%|██████████| 233/233 [09:04<00:00,  2.33s/it, loss=0.693, v_num=15]

In [30]:
print(dir(trainer))



In [None]:
trainer.test(ckpt_path=checkpoint_callback.best_model_path)

## 発話のduplicationを削除が上手くいっているか検証

In [1]:
a = ['こんにちは', 'こんにちは', 'hello']
a = list(set(a))
print(a)

['hello', 'こんにちは']


## HANの中身

In [2]:
import numpy as np
import torch
import torch.nn.functional as F

from torch import nn

### word atten net

In [27]:
class AttentionWithContext(nn.Module):
    def __init__(self, hidden_dim):
        super(AttentionWithContext, self).__init__()

        self.attn = nn.Linear(hidden_dim, hidden_dim)
        self.contx = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, inp):
        # The first expression in the attention mechanism is simply a linear layer that receives 
        # the output of the Word-GRU referred here as 'inp' and h_{it} in the paper
        u = torch.tanh_(self.attn(inp))
        # The second expression is...the same but without bias, wrapped up in a Softmax function
        a = F.softmax(self.contx(u), dim=1)
        # And finally, an element-wise multiplication taking advantage of Pytorch's broadcasting abilities 
        s = (a * inp).sum(1)
        # we will also return the normalized importance weights
        return a.permute(0, 2, 1), s

In [28]:
class Attention(nn.Module):
    def __init__(self, hidden_dim, seq_len):
        super(Attention, self).__init__()

        self.hidden_dim = hidden_dim
        self.seq_len = seq_len
        self.weight = nn.Parameter(nn.init.kaiming_normal_(torch.Tensor(hidden_dim, 1)))
        self.bias = nn.Parameter(torch.zeros(seq_len))

    def forward(self, inp):
        # 1. Matrix Multiplication
        x = inp.contiguous().view(-1, self.hidden_dim)
        u = torch.tanh_(torch.mm(x, self.weight).view(-1, self.seq_len) + self.bias)
        # 2. Softmax on 'u_{it}' directly
        a = F.softmax(u, dim=1)
        # 3. Braodcasting and out
        s = (inp * torch.unsqueeze(a, 2)).sum(1)
        return a, s

In [29]:
class WordAttnNet(nn.Module):
    def __init__(
        self,
        vocab_size,
        hidden_dim=32,
        padding_idx=1,
        embed_dim=50,
        embedding_matrix=None,
    ):
        super(WordAttnNet, self).__init__()

        if isinstance(embedding_matrix, np.ndarray):
            self.word_embed = nn.Embedding(
                vocab_size, embedding_matrix.shape[1], padding_idx=padding_idx
            )
            self.word_embed.weight = nn.Parameter(torch.Tensor(embedding_matrix))
            embed_dim = embedding_matrix.shape[1]
        else:
            self.word_embed = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)

        self.rnn = nn.GRU(embed_dim, hidden_dim, bidirectional=True, batch_first=True)

        self.word_attn = AttentionWithContext(hidden_dim * 2)

    def forward(self, X, h_n):
        embed = self.word_embed(X.long())
        h_t, h_n = self.rnn(embed, h_n)
        a, s = self.word_attn(h_t)
        return a, s.unsqueeze(1), h_n

In [4]:
bsz = 16
maxlen_sent = 20
hidden_dim  = 32
embed_dim   = 100
vocab_size  = 1000
padding_idx = 1

# net
word_embed = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
rnn = nn.GRU(embed_dim, hidden_dim, bidirectional=True, batch_first=True)
attn = nn.Linear(hidden_dim*2, hidden_dim*2)
contx = nn.Linear(hidden_dim*2, 1, bias=False)

# inputs
X = torch.from_numpy(np.random.choice(vocab_size, (bsz, maxlen_sent)))
h_n = torch.zeros((2, bsz, hidden_dim))

In [6]:
X.shape

torch.Size([16, 20])

In [7]:
# 1. Word Embeddings
# (bsz, maxlen_sent, embed_dim)
embed = word_embed(X)
embed.shape # (bsz, maxlen_sent, emb_dim)

torch.Size([16, 20, 100])

In [9]:
# 2. GRU
h_t, h_n = rnn(embed, h_n)
# (bsz, seq_len, hidden_dim*2)
h_t.shape

torch.Size([16, 20, 64])

In [13]:
# 3. Attention
u = torch.tanh_(attn(h_t))
cu = contx(u)
a = F.softmax(cu, dim=1) # 全単語に対して重みを計算
print(h_t.shape, u.shape, cu.shape, a.shape)

torch.Size([16, 20, 64]) torch.Size([16, 20, 64]) torch.Size([16, 20, 1]) torch.Size([16, 20, 1])


In [14]:
contx.weight.shape

torch.Size([1, 64])

In [12]:
# RNN outputs scaled by their importance weights
s = (a * h_t) # 単語の隠れ状態の重み付き和が文のベクトルsになる。
print(s.shape)
# Sum along the seq dim so we end up with a representation per document/review
s = s.sum(1)
print(s.shape)
# Because this will be stack for all sentences, we do the `.unsqueeze(1)`
print(s.unsqueeze(1).shape)

torch.Size([16, 20, 64])
torch.Size([16, 64])
torch.Size([16, 1, 64])


In [18]:
f = torch.ones(2,3)
print(f)
print(f.unsqueeze(1))

tensor([[1., 1., 1.],
        [1., 1., 1.]])
tensor([[[1., 1., 1.]],

        [[1., 1., 1.]]])


### sent attn net

In [31]:
class SentAttnNet(nn.Module):
    def __init__(
        self, word_hidden_dim=32, sent_hidden_dim=32, padding_idx=1
    ):
        super(SentAttnNet, self).__init__()

        self.rnn = nn.GRU(
            word_hidden_dim * 2, sent_hidden_dim, bidirectional=True, batch_first=True
        )

        self.sent_attn = AttentionWithContext(sent_hidden_dim * 2)

    def forward(self, X):
        h_t, h_n = self.rnn(X)
        a, v = self.sent_attn(h_t)
        return a.permute(0,2,1), v

### han

In [32]:
class HierAttnNet(nn.Module):
    def __init__(
        self,
        vocab_size,
        maxlen_sent,
        maxlen_doc,
        word_hidden_dim=32,
        sent_hidden_dim=32,
        padding_idx=1,
        embed_dim=50,
        embedding_matrix=None,
        num_class=4,
    ):
        super(HierAttnNet, self).__init__()

        self.word_hidden_dim = word_hidden_dim

        self.wordattnnet = WordAttnNet(
            vocab_size=vocab_size,
            hidden_dim=word_hidden_dim,
            padding_idx=padding_idx,
            embed_dim=embed_dim,
            embedding_matrix=embedding_matrix,
        )

        self.sentattnnet = SentAttnNet(
            word_hidden_dim=word_hidden_dim,
            sent_hidden_dim=sent_hidden_dim,
            padding_idx=padding_idx,
        )

        self.fc = nn.Linear(sent_hidden_dim * 2, num_class)

    def forward(self, X):
        x = X.permute(1, 0, 2)
        word_h_n = nn.init.zeros_(torch.Tensor(2, X.shape[0], self.word_hidden_dim))
        if use_cuda:
            word_h_n = word_h_n.cuda()
        # alpha and s Tensor Lists
        word_a_list, word_s_list = [], []
        for sent in x:
            word_a, word_s, word_h_n = self.wordattnnet(sent, word_h_n)
            word_a_list.append(word_a)
            word_s_list.append(word_s)
        # Importance attention weights per word in sentence
        self.sent_a = torch.cat(word_a_list, 1)
        # Sentences representation
        sent_s = torch.cat(word_s_list, 1)
        # Importance attention weights per sentence in doc and document representation
        self.doc_a, doc_s = self.sentattnnet(sent_s)
        return self.fc(doc_s)

In [5]:
maxlen_sent = 20
maxlen_doc = 5
num_class = 4
word_hidden_dim = 32
sent_hidden_dim = 32

wordattnnet = WordAttnNet(vocab_size, hidden_dim, padding_idx, embed_dim, embedding_matrix=None)
sentattnnet = SentAttnNet(word_hidden_dim, sent_hidden_dim, padding_idx)
fc = nn.Linear(sent_hidden_dim * 2, num_class)

NameError: name 'WordAttnNet' is not defined

In [6]:
X = torch.from_numpy(np.random.choice(vocab_size, (bsz, maxlen_doc, maxlen_sent)))

In [20]:
a = [961, 626, 368, 545,  76,  57, 609, 835, 773, 544, 776, 623,  80, 758, 617, 527, 581, 592, 432, 445]
b = torch.tensor(a)
print(b)

tensor([961, 626, 368, 545,  76,  57, 609, 835, 773, 544, 776, 623,  80, 758,
        617, 527, 581, 592, 432, 445])


In [18]:
print(X[0])

tensor([[961, 626, 368, 545,  76,  57, 609, 835, 773, 544, 776, 623,  80, 758,
         617, 527, 581, 592, 432, 445],
        [ 21, 603,  33, 201, 136, 819, 933, 178, 610,  43,  64, 564, 570, 384,
         784, 942, 967, 182, 774, 675],
        [113, 722,  25, 805, 676, 237, 139, 820, 259, 500, 821, 932, 902, 542,
         783, 895, 155, 973, 737, 699],
        [324, 480, 342, 372, 205, 772, 362, 428, 917, 910, 135, 492, 480, 998,
         258, 955, 418, 535, 289, 652],
        [995, 440, 408, 548,  82, 389, 481, 906, 925, 966, 809, 215, 165, 421,
         943, 529, 972, 156, 248, 580]])


In [23]:
x = X.permute(1, 0, 2)
x.shape

torch.Size([5, 16, 20])

In [24]:
# Initial Word RNN hidden state
word_h_n = nn.init.zeros_(torch.Tensor(2, X.shape[0], word_hidden_dim))

In [26]:
# Loop through sentences:
word_a_list, word_s_list = [], []
for sent in x:
    print(sent.shape)
    word_a, word_s, word_h_n = wordattnnet(sent, word_h_n)
    word_a_list.append(word_a)
    word_s_list.append(word_s)
# Importance attention weights per word in sentence
sent_a = torch.cat(word_a_list, 1)
# Sentences representation
sent_s = torch.cat(word_s_list, 1)
# (bsz, maxlen_doc, maxlen_sent)
print(sent_a.shape)
# (bsz, maxlen_doc, hidden_dim*2)
print(sent_s.shape)

torch.Size([16, 20])
torch.Size([16, 20])
torch.Size([16, 20])
torch.Size([16, 20])
torch.Size([16, 20])
torch.Size([16, 5, 20])
torch.Size([16, 5, 64])


In [27]:
doc_a, doc_s = sentattnnet(sent_s)
# (bsz, maxlen_doc, 1). One could .squeeze(2)
print(doc_a.shape)
# (bsz, hidden_dim*2)
print(doc_s.shape) # docの隠れ状態

torch.Size([16, 5, 1])
torch.Size([16, 64])


In [28]:
out = fc(doc_s) # (bsz, class_num)
out.shape

torch.Size([16, 4])

## DataModule.pyが正しく動いているかチェック
⇒同時に、HANに入れる前にどうしていたかもチェック
⇒ちゃんとできてそうだった！

In [5]:
import pytorch_lightning as pl
from pytorch_lightning.accelerators import accelerator
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.accelerators.gpu import GPUAccelerator
from pytorch_lightning.plugins import NativeMixedPrecisionPlugin

In [8]:
import os, sys
sys.path.append(os.pardir)
from model.HANDataModule import CreateHANDataModule
from preprocess.tokenizer_HAN import HANtokenizer

In [3]:
import pandas as pd

train_df = pd.read_pickle('../model/data/nested/train.pkl')
valid_df = pd.read_pickle('../model/data/nested/valid.pkl')
test_df = pd.read_pickle('../model/data/nested/test.pkl')

In [9]:
pl.seed_everything(111)
data_module = CreateHANDataModule(train_df, valid_df, test_df, batch_size=5, tokenizer=HANtokenizer())


Global seed set to 111


In [10]:
data_module.setup(stage='test')

In [15]:
test_ds = data_module.test_ds

In [16]:
len(test_ds.__getitem__(0).get('nested_utters'))

180

In [17]:
from torch.utils.data import DataLoader
_test_dl = DataLoader(dataset=test_ds, batch_size=5, shuffle=False, pin_memory=True)

In [20]:
print(len(_test_dl))

94


In [26]:
x,y = next(iter(_test_dl)).values()
print(x)

tensor([[[  25,    8,    5,  ...,    1,    1,    1],
         [ 753,    5,  101,  ...,    1,    1,    1],
         [1173,    5,  120,  ...,    1,    1,    1],
         ...,
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1]],

        [[ 162,   82,   56,  ...,    1,    1,    1],
         [   8,  297,   58,  ...,    1,    1,    1],
         [   8,    9, 7743,  ...,    1,    1,    1],
         ...,
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1]],

        [[ 775,    8,    9,  ...,    1,    1,    1],
         [ 163,    5,  940,  ...,    1,    1,    1],
         [ 220,  220,    5,  ...,    1,    1,    1],
         ...,
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1]],

In [11]:
test_dl = data_module.test_dataloader()

A100-PCIE-40GB with CUDA capability sm_80 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.
If you want to use the A100-PCIE-40GB GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



In [12]:
len(test_dl)

2

In [13]:
test_batch = next(iter(test_dl))

In [14]:
print(test_batch)

nested_utters


## pandasを思ったように使えているか実験(preprocess_HAN.py, HANDataModule.py)

In [4]:
padding = [[1 for _ in range(10)] for _ in range(2)]
print(padding)

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]


In [19]:
import pandas as pd
import os, sys
sys.path.append(os.pardir)
from preprocess.tokenizer_HAN import HANtokenizer

index = 0

df = pd.read_pickle('../model/data/nested/test.pkl')
df_row = df.loc[:,'nested_utters'].iloc[index]['raw_nested_utters']
label = df.loc[:,'label'].iloc[index]

preproceser = HANtokenizer()
print(df_row.head())
print(label)

0                                    …<person>、なのか？（汗）
1    っと、村の決め方についてと吊りが<person>ってことは、了解した。吊りはもうセットしたぜ...
2                       えーと、狩人はＣＯした方が良いんじゃね？誰を守ったかってな。
3    …午前１時過ぎに言おうって決めてたんだけどよ、ＰＣ出来なくなっちまったんだよ。（汗）怪しく思...
4    …<person>、<person>？<person>は『<person>は、人狼だったのじ...
Name: raw_nested_utters, dtype: object
1


In [20]:
padded_nested_utters = preproceser.encode(df_row)

In [None]:
print(padded_nested_utters)

[tensor([  116,   150,     5,  2365,    55,    20,     4,    69,   386,     5,
        11792,     7,    25,  2043,    16,    79,     8,     5,   271,    10,
         1574,    46,    18,    15,     5,  2522,    11,     7,   587,    10,
         1506,    15,    19,  7415,    56,   127,    16,    31,     8,     5,
         2037,    15,     7,   908,    32,    80,    10,   360,    48,   145,
           22,   155,   150,     5,  1291,     9,  2016,    20,    66,    70,
           20,     6,     9,   608,    16,   232,    25,     4,   228,     5,
           93,     6,   287,    11,  4393,    90,  1276,    10,   157,    22,
            4,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,

In [None]:
ones = '1 '*10
print(list(map(int, ones.split())))

In [None]:
import torch
padded_nested_utters.head()

In [None]:
padded_nested_utters[0][0][0][0]

In [3]:
df_row_tokenized = df_row.apply(preproceser.tokenize)

In [4]:
df_row_numericalized = df_row_tokenized.apply(preproceser.numericalize)

In [None]:
print(type(df_row_numericalized[0][0]))

In [5]:
word_padded_df = df_row_numericalized.apply(preproceser.padding_word_level)

In [6]:
import torch
word_padded = [torch.tensor(utter) for utter in word_padded_df.to_list()]

In [7]:
padding = [torch.ones_like(word_padded[0]) for _ in range(20)]
print(len(word_padded))
padded = word_padded + padding
print(len(padded))

104
124


In [7]:
padded_nested_utters = preproceser.padding_sent_level(word_padded)

In [8]:
a = torch.tensor(padded_nested_utters)

TypeError: only integer tensors of a single element can be converted to an index

In [None]:
padded_nested_utters[0][0]

pandasの実験（くっ付ける）

In [None]:
import pandas as pd

a = [1,2,3,4,5,6,7,8]
b = [9,10,11,12,13]

a_df = pd.DataFrame({'utter': a})
b_df = pd.DataFrame({'utter': b})

c = pd.concat([a_df, b_df], axis=0)
print(c)

tokenize⇒numericalize⇒embeddig matrix作成までの実験(preprocecc_HAN)のため。

In [None]:
from torchtext.vocab import Vocab
from collections import Counter, OrderedDict
from typing import Dict, List, Optional, Iterable
from torchtext._torchtext import (
    Vocab as VocabPybind,
)

def vocab_with_vocab_size(ordered_dict: Dict, min_freq: int = 1, vocab_size: int = 32000) -> Vocab:
    r"""Factory method for creating a vocab object which maps tokens to indices.

    Note that the ordering in which key value pairs were inserted in the `ordered_dict` will be respected when building the vocab.
    Therefore if sorting by token frequency is important to the user, the `ordered_dict` should be created in a way to reflect this.

    Args:
        ordered_dict: Ordered Dictionary mapping tokens to their corresponding occurance frequencies.
        min_freq: The minimum frequency needed to include a token in the vocabulary.

    Returns:
        torchtext.vocab.Vocab: A `Vocab` object

    Examples:
        >>> from torchtext.vocab import vocab
        >>> from collections import Counter, OrderedDict
        >>> counter = Counter(["a", "a", "b", "b", "b"])
        >>> sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        >>> ordered_dict = OrderedDict(sorted_by_freq_tuples)
        >>> v1 = vocab(ordered_dict)
        >>> print(v1['a']) #prints 1
        >>> print(v1['out of vocab']) #raise RuntimeError since default index is not set
        >>> tokens = ['e', 'd', 'c', 'b', 'a']
        >>> v2 = vocab(OrderedDict([(token, 1) for token in tokens]))
        >>> #adding <unk> token and default index
        >>> unk_token = '<unk>'
        >>> default_index = -1
        >>> if unk_token not in v2: v2.insert_token(unk_token, 0)
        >>> v2.set_default_index(default_index)
        >>> print(v2['<unk>']) #prints 0
        >>> print(v2['out of vocab']) #prints -1
        >>> #make default index same as index of unk_token
        >>> v2.set_default_index(v2[unk_token])
        >>> v2['out of vocab'] is v2[unk_token] #prints True
    """

    tokens = []
    for token, freq in ordered_dict.items():
        if freq >= min_freq:
            tokens.append(token)
        if len(tokens) > vocab_size:
            break

    return Vocab(VocabPybind(tokens, None))


def build_vocab_from_iterator_with_vocab_size(iterator: Iterable, min_freq: int = 1, vocab_size: int = 32000, specials: Optional[List[str]] = None, special_first: bool = True) -> Vocab:
    """
    Build a Vocab from an iterator.

    Args:
        iterator: Iterator used to build Vocab. Must yield list or iterator of tokens.
        min_freq: The minimum frequency needed to include a token in the vocabulary.
        specials: Special symbols to add. The order of supplied tokens will be preserved.
        special_first: Indicates whether to insert symbols at the beginning or at the end.


    Returns:
        torchtext.vocab.Vocab: A `Vocab` object

    Examples:
        >>> #generating vocab from text file
        >>> import io
        >>> from torchtext.vocab import build_vocab_from_iterator
        >>> def yield_tokens(file_path):
        >>>     with io.open(file_path, encoding = 'utf-8') as f:
        >>>         for line in f:
        >>>             yield line.strip().split()
        >>> vocab = build_vocab_from_iterator(yield_tokens_batch(file_path), specials=["<unk>"])
    """

    counter = Counter()
    for tokens in iterator:
        counter.update(tokens)

    if specials is not None:
        for tok in specials:
            del counter[tok]

    sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[0])
    sorted_by_freq_tuples.sort(key=lambda x: x[1], reverse=True)
    ordered_dict = OrderedDict(sorted_by_freq_tuples)

    if specials is not None:
        if special_first:
            specials = specials[::-1]
        for symbol in specials:
            ordered_dict.update({symbol: min_freq})
            ordered_dict.move_to_end(symbol, last=not special_first)

    word_vocab = vocab_with_vocab_size(ordered_dict, min_freq=min_freq, vocab_size=vocab_size + len(specials))
    return word_vocab

In [None]:
from collections import Counter
from torchtext.vocab import build_vocab_from_iterator
import MeCab

wakati = MeCab.Tagger("-O wakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")

def yield_tokens():
    with open('../tokenizer/split_train.txt', 'r') as f:
        for line in f:
            yield line.split()
specials=['<unk>', '<PAD>', '<BOS>', '<EOS>']
vocab = build_vocab_from_iterator_with_vocab_size(yield_tokens(), min_freq=5, specials=specials)

In [None]:
vocab.lookup_token(0)

In [None]:
from torchtext.vocab import Vectors

vectors = Vectors(name='model_fasttext.vec',
                        cache='../tokenizer/dim_200/'
                        )

In [None]:
import torch
vocab_size = 32000
special_tokens_matrix = torch.zeros(len(specials), 200)

In [None]:
stoi = vocab.get_stoi()
sorted_stoi = dict(sorted(stoi.items(), key=lambda x: x[1]))

In [None]:
other_tokens_matrix = vectors.get_vecs_by_tokens(list(sorted_stoi.keys())[len(specials):])

In [None]:
embedding_matrix = torch.cat((special_tokens_matrix ,other_tokens_matrix), dim=0)

In [None]:
import MeCab

wakati = MeCab.Tagger("-O wakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
line = wakati.parse("私。<person>さんは【人間】だったわ。でも、<person>さんが危惧していた通りになってしまったわね・・・ご冥福をお祈り致しますわ。私も今日は少し所用があるので、本格的に顔を出せるのは夕刻以降になりそうですわ。それでは、また。").split()
print(line)
indices = vocab.lookup_indices(line)
print(indices)

In [None]:
embedding_matrix.shape

形態素解析についての実験

In [None]:
import pandas as pd

valid = pd.read_pickle('../model/data/nested/valid.pickle')

print(valid.head())

In [None]:
print(valid.loc[:,'utters'].iloc[0].loc[:,'parsed_utters'].head())

In [None]:
print(valid.loc[:,'labels'].iloc[0])

In [None]:
print(valid['utters'][0]['parsed_utters'][0].head())

In [None]:
import pandas as pd

texts1 = ['あああ', 'いいい', 'ううう']
texts2 = ['えええ', 'おおお', 'かかか']
utterances1 = pd.DataFrame({'texts1':texts1})
print(utterances1['texts1'].to_list())
len(utterances1)

In [None]:
import MeCab
wakati = MeCab.Tagger("-O wakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")

line = wakati.parse("私。<person>さんは【人間】だったわ。<br>でも、レジーナさんが危惧していた通りになってしまったわね・・・ご冥福をお祈り致しますわ。私も今日は少し所用があるので、本格的に顔を出せるのは夕刻以降になりそうですわ。それでは、また。")
print(line.split())

In [None]:
import pickle
from sklearn.model_selection import StratifiedKFold

with open('../model/data/train.pickle','rb') as f:
    train = pickle.load(f)

print(f"{train['num_utters'].mean():,.2f}")

print(train['utters'][0].head())

"""
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for fold_id, (train_idx, valid_index) in enumerate(cv.split(train['utters'], train['labels'])):
    X_tr = train['utters'][train_idx]
    X_val = train['utters'][valid_index]
    y_tr = train['labels'][train_idx]
    y_val = train['labels'][valid_index]
    print(fold_id)
    print(X_tr)
    print(y_tr)
"""

In [None]:
import pandas as pd
import pickle

texts1 = ['あああ', 'いいい', 'ううう']
texts2 = ['えええ', 'おおお', 'かかか']
utterances1 = pd.DataFrame({'texts':texts1})
utterances2 = pd.DataFrame({'texts':texts2})
utterances = [utterances1, utterances2]
labels = [0, 1]

df = pd.DataFrame({'utters':utterances, 'labels': labels})

for t in df.get('utters'):
    t['length'] = len(t['texts'])

print(df)

""" train = []

for utters, label in zip(utterances, labels):
    train.append({'utterances': utters, 'labels': label})

for t in train:
    t.get('utterances')['length'] = len(t.get('utterances'))

with open('train.pickle', 'bw') as f:
    pickle.dump(train, f, protocol=5)

with open('train.pickle', 'br') as f:
    trainp = pickle.load(f)

print(trainp) """

In [None]:
import torchmetrics
import torch
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns


def cm(preds, labels):
    cm = torchmetrics.ConfusionMatrix(num_classes=2)

    df_cm = pd.DataFrame(cm(preds, labels).numpy())
    print(df_cm.to_string())

    plt.figure(figsize=(2,2))
    fig = sns.heatmap(df_cm, annot=True, cmap='Spectral').get_figure()
    plt.show()
preds = torch.tensor([0,1,1,1])
labels = torch.tensor([0,1,0,1])

cm(preds, labels)



In [None]:
from torchmetrics import AveragePrecision, F1
# from torchmetrics import BinnedAveragePrecision
import torch
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

pred = torch.tensor([0,1,1,1,0])
label = torch.tensor([0,1,0,1,1])

# 2/3 = 0.66...
average_precision = AveragePrecision(pos_label=1, num_classes=2)
# bined_precision = BinnedAveragePrecision(num_classes=1)
f1 = F1(num_classes=2)
f1_skl, prec, recall,_ = precision_recall_fscore_support(label, pred, average="micro")
print(f"average precision:{average_precision(pred, label)}")
# print(f"bined precision{bined_precision(pred, label)}")
print(f"torch metrics{f1(pred, label)}")
print(f"skl f1: {f1_skl}, prec:{prec}, recall:{recall}")
cm(pred, label)
scores_df = pd.DataFrame(np.array(precision_recall_fscore_support(preds, preds)).T,
                                    columns=["precision", "recall", "f1", "support"],
                                )
fig, ax = plt.subplots(figsize=(2,2))
ax.axis('off')
ax.axis('tight')
ax.table(cellText=scores_df.values,
         colLabels=scores_df.columns,
         loc='center')
plt.show()

In [None]:
import torch
p=torch.Tensor([[0.9, 0.1], [0.4, 0.6]])
print(p.argmax(dim=1))

In [None]:
import pandas as pd

test = pd.read_pickle('../model/data/flat/balance/test.pkl')

lie = test.query("label==1")
lie['lie'] = False
lie['level'] = ''
lie['reason'] = ''
lie = lie[:5000]
lie.to_csv('lie1.csv', index=False)

In [None]:
import torch
from transformers import BertJapaneseTokenizer
import os, sys
sys.path.append(os.pardir)
from model.BERT import Classifier
pretrained_model_path='cl-tohoku/bert-base-japanese'
tokenizer = BertJapaneseTokenizer.from_pretrained(pretrained_model_path)

#model = Classifier.load_from_checkpoint('/home/lyriatest/haoki/Documents/vscode-workplaces/lie-detector/model/checkpoints/balance/epoch=3.ckpt', n_classes=2)
tokenizer.decode
def tokenize(text):
    encoding = tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=512,
                padding="max_length",
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )
    return encoding['input_ids'].flatten(), encoding['attention_mask'].flatten()
   
text1 = 'うん、何か白アピしてみたけど全然神父白くないな。<br>神父狼あるな！！<br><br>そんなわけで私はパメラ人だと思うので、じいさんは生きている限り、<br>羊→旅→宿or神の順に焼き払ってくれたらいいかと。<br>宿と旅を見比べたけど、襲撃筋考えると旅狼の方が濃い。だから旅先吊り。パメラは襲撃されなかったら考える。'
text2 = ''
input_ids = []
atten_mask = []
for t in [text1, text2]:
    print(t, len(t))
    ids = tokenizer.encode(t, add_special_tokens=False)
    print(ids, len(ids))
    decode = tokenizer.decode(ids, skip_special_tokens=True)
    print(decode, len(decode.split(' ')))
    print()

# print(torch.stack(ex, dim=1))
# a = encoding['input_ids']
# print(a.size())

# print(model(torch.stack(input_ids, dim=0), torch.stack(atten_mask, dim=0)))

In [None]:
import json
a = [1,2,3]

with open('./pra.txt', 'w') as f:
    for row in a:
        f.write(f"{row}\n")
        print("yay", file=f)

In [None]:
from transformers import BertJapaneseTokenizer
import pandas as pd
import sys, os
sys.path.append(os.pardir)
from utils.cal_stats import cal_stats

pretrained_model_path='cl-tohoku/bert-base-japanese'
tokenizer = BertJapaneseTokenizer.from_pretrained(pretrained_model_path)

train_df = pd.read_pickle('/home/lyriatest/haoki/Documents/vscode-workplaces/lie-detector/model/data/flat/aaa/balance/train.pkl')
valid_df = pd.read_pickle('/home/lyriatest/haoki/Documents/vscode-workplaces/lie-detector/model/data/flat/aaa/balance/valid.pkl')
test_df = pd.read_pickle('/home/lyriatest/haoki/Documents/vscode-workplaces/lie-detector/model/data/flat/aaa/balance/test.pkl')

for name, df in [('train', train_df), ('valid', valid_df), ('test', test_df)]:
    text = df['text'].tolist()
    label = df['label'].tolist()

    stats, sents_tokens_to_check = cal_stats(text, label, tokenizer)

    with open(f'./stats_{name}.txt', 'w') as f:
        f.write(stats)

    with open(f'sents_tokens_{name}.txt', 'w') as f:
        for l in sents_tokens_to_check:
            f.write(l)

In [None]:
from transformers import BertJapaneseTokenizer
import pandas as pd

pretrained_model_path='cl-tohoku/bert-base-japanese'
tokenizer = BertJapaneseTokenizer.from_pretrained(pretrained_model_path)

valid_df = pd.read_pickle('/home/lyriatest/haoki/Documents/vscode-workplaces/lie-detector/model/data/flat/balance/train.pkl')
sentences = valid_df['text'].tolist()
labels = valid_df['label'].tolist()
len_count = {}

for sentence, _ in zip(sentences, labels):
    tokens = tokenizer.tokenize(sentence)
    sent_len = len(tokens)
    len_count[sent_len] = len_count.get(sent_len, 0) + 1

In [None]:
len_count = sorted(len_count.items(), key=lambda x:x[0])
len_count