In [1]:
import numpy as np 
from janome.tokenizer import Tokenizer
from gensim.models.keyedvectors import KeyedVectors
import torch
from torch import nn, optim
import pickle
from sys import argv
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')

In [3]:
tkz = Tokenizer()
s = '私は犬が好き。'
ws = [w for w in tkz.tokenize(s, wakati=True)]

In [4]:
ws

['私', 'は', '犬', 'が', '好き', '。']

In [5]:
w2v = KeyedVectors.load_word2vec_format('./hidden_files/entity_vector.model.bin', binary=True)

In [6]:
xn = torch.tensor([w2v[w] for w in np.array(ws)])
print(xn.shape)

# LSTMの入力のためにバッチ化
xn = xn.unsqueeze(0)

print(xn.shape)

torch.Size([6, 200])
torch.Size([1, 6, 200])


  xn = torch.tensor([w2v[w] for w in np.array(ws)])


In [7]:
# LSTM

lstm = nn.LSTM(200, 200, batch_first=True)
h0 = torch.randn(1, 1, 200)
c0 = torch.randn(1, 1, 200)
yn, (hn, cn) = lstm(xn, (h0, c0))
print(yn.shape)
print(hn.shape)
print(cn.shape)

torch.Size([1, 6, 200])
torch.Size([1, 1, 200])
torch.Size([1, 1, 200])


In [8]:
with open('./hidden_files/LSTM/dic.pkl', 'br') as f:
    dic = pickle.load(f)

for i in dic.items():
    print(dict([i]))
    break

{'万能': 1}


In [9]:
# 訓練データの確認

with open('./hidden_files/LSTM/xtrain.pkl', 'br') as f:
    xdata = pickle.load(f)

with open('./hidden_files/LSTM/ytrain.pkl', 'br') as f:
    ydata = pickle.load(f)

with open('./hidden_files/LSTM/label.pkl', 'br') as f:
    labels = pickle.load(f)

In [10]:
print('訓練データのバッチ６番目は：',xdata[6])
print('正解データのバッチ６番目は：',ydata[6])

訓練データのバッチ６番目は： [74, 75, 2, 60, 76, 62, 5, 6]
正解データのバッチ６番目は： [9, 0, 1, 8, 5, 7, 3, 4]


In [11]:
class MyLSTM(nn.Module):
    def __init__(self, voccsize, posn, hdim):
        super().__init__()
        self.embed = nn.Embedding(voccsize, hdim)
        self.lstm = nn.LSTM(hdim, hdim, batch_first=True)
        self.ln = nn.Linear(hdim, posn)

    def forward(self, x):
        ex = self.embed(x)
        lo = self.lstm(ex)
        out = self.ln(lo[0])
        return out
        

In [12]:
net = MyLSTM(len(dic)+1, len(labels), 100).to(device) # dic:word2idに0を除いているので+1
optimizer = optim.SGD(net.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [13]:
dic

{'万能': 1,
 'で': 2,
 'は': 3,
 'ない': 4,
 'です': 5,
 '。': 6,
 'も': 7,
 '、': 8,
 '他': 9,
 'の': 10,
 '書籍': 11,
 '勉強': 12,
 'し': 13,
 'て': 14,
 'い': 15,
 '行き詰っ': 16,
 'た': 17,
 'とき': 18,
 'に': 19,
 'すごく': 20,
 'コンパクト': 21,
 'まとまっ': 22,
 'いる': 23,
 '便利': 24,
 'これ': 25,
 '一': 26,
 '冊': 27,
 'なん': 28,
 'と': 29,
 'か': 30,
 'しよう': 31,
 'する': 32,
 '危険': 33,
 'が': 34,
 '独学': 35,
 '暗中': 36,
 '模索': 37,
 'な': 38,
 '人': 39,
 'けっこう': 40,
 '役': 41,
 '立つ': 42,
 '思い': 43,
 'ます': 44,
 'SketchUp': 45,
 '本': 46,
 'なかなか': 47,
 '出': 48,
 '探し': 49,
 'まし': 50,
 '基本': 51,
 '操作': 52,
 'ほとんど': 53,
 '網羅': 54,
 '初心': 55,
 '者': 56,
 '特に': 57,
 '建築': 58,
 '系': 59,
 'とても': 60,
 '使い': 61,
 'やすい': 62,
 'ある': 63,
 '程度': 64,
 '判っ': 65,
 '機能': 66,
 '別': 67,
 '構成': 68,
 'さ': 69,
 'れ': 70,
 'リファレンス': 71,
 '的': 72,
 '使え': 73,
 'フル': 74,
 'カラー': 75,
 '見': 76,
 'セイ': 77,
 'リング': 78,
 '中': 79,
 '万一': 80,
 'こと': 81,
 'あっ': 82,
 '参考': 83,
 'なる': 84,
 '思っ': 85,
 '読ん': 86,
 'だ': 87,
 '「': 88,
 'のど': 89,
 '渇い': 90,
 '最後': 91,
 '水': 92,
 

In [14]:
for ep in range(10):
    losslk = 0.0
    for i in range(len(xdata)):
        x = xdata[i]
        x = torch.LongTensor(x).to(device)
        output = net(x)
        y = torch.LongTensor(ydata[i]).to(device)
        loss = criterion(output, y)
        if(i % 1000 == 0):
            print(i, losslk)
            losslk = loss.item() なのか？# なぜ+=ではなく=なのか？
        else:
            losslk += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    outfile = "lstm0" + str(ep) + ".model"
    torch.save(net.state_dict(), outfile)

SyntaxError: invalid character '？' (U+FF1F) (63677825.py, line 11)

## LSTMの推論

In [15]:
net = MyLSTM(len(dic)+1, len(labels), 100).to(device)
net.load_state_dict(torch.load('./lstm09.model'))

<All keys matched successfully>

In [16]:
with open('./hidden_files/LSTM/xtest.pkl', 'br') as f:
    xtest = pickle.load(f)
    
with open('./hidden_files/LSTM/ytest.pkl', 'br') as f:
    ytest = pickle.load(f)


In [17]:
real_data_num = 0

net.eval()
with torch.no_grad():
    ok = 0
    for i in range(len(xtest)):
        real_data_num += len(xtest[i])
        x = [xtest[i]]
        x = torch.LongTensor(x).to(device)
        output = net(x)
        pred = torch.argmax(output[0], dim=1)
        y = torch.LongTensor(ytest[i]).to(device)
        ok += torch.sum(pred == y).item()

    print(ok, real_data_num, ok/real_data_num)

26211 27271 0.9611308716218694


## LSTMのバッチ処理

In [18]:
class MyDataset(Dataset):
    def __init__(self, xdata, ydata):
        self.data = xdata
        self.label = ydata
    # len()で取得
    def __len__(self):
        return len(self.label)

    # dataset[1]なでど[]でアクセスできる
    def __getitem__(self, idx):
        x = self.data[idx]
        y = self.label[idx]
        return x, y

# データローダーでバッチを作成するためのカスタム collate 関数
def my_collate_fn(batch):
    xdata, ydata = list(zip(*batch))
    xs = list(xdata)
    ys = list(ydata)
    return xs, ys

with open('./hidden_files/LSTM/xtrain.pkl', 'br') as fr:
    xdata = pickle.load(fr)
    
with open('./hidden_files/LSTM/ytrain.pkl', 'br') as fr:
    ydata = pickle.load(fr)

batch_size = 200
dataset = MyDataset(xdata, ydata)
dataloader = DataLoader(dataset, batch_size = batch_size, shuffle=True, collate_fn=my_collate_fn)

In [19]:
# dataloaderの挙動チェック
dataloader = DataLoader(dataset, batch_size = 3, shuffle=True, collate_fn=my_collate_fn)
d1 = dataloader.__iter__()

print(f'全データ数：{len(dataset)}')
print(f'１バッチのデータ数{len(d1)}')

xs, ys = next(d1)
print('-'*30)
print(xs)
print(ys)

全データ数：50000
１バッチのデータ数16667
------------------------------
[[5349, 14, 23, 10, 3, 327, 164, 5, 186, 6], [240, 8, 2511, 2512, 2, 1746, 8, 2681, 7, 19225, 10, 313, 1799, 34, 8352, 8, 23022, 154, 155, 156, 17, 6], [261, 12200, 10, 1737, 34, 14847, 81, 3, 8, 1972, 7, 3767, 14, 15, 154, 155, 264, 775, 2252, 1737, 398, 4, 2472, 10, 26, 653, 3, 8, 1694, 10, 1687, 125, 34, 12375, 38, 1200, 19, 2749, 13, 17, 318, 2, 193, 8, 23541, 38, 1200, 19, 3, 3296, 398, 4, 81, 156, 17, 6]]
[[5, 1, 5, 1, 1, 0, 1, 3, 1, 4], [11, 4, 0, 0, 3, 2, 4, 0, 1, 0, 1, 2, 0, 1, 0, 4, 5, 3, 3, 3, 3, 4], [4, 0, 1, 0, 1, 2, 0, 1, 4, 0, 1, 5, 1, 5, 3, 3, 4, 15, 2, 0, 5, 3, 0, 1, 0, 7, 1, 4, 0, 1, 5, 7, 1, 9, 3, 0, 1, 0, 5, 3, 0, 3, 5, 4, 9, 3, 0, 1, 1, 0, 5, 3, 0, 3, 3, 4]]


## Point
___
・各データのリスト長がバラバラなので単純にxs, xyから配列を作成することはできない。  
・Paddingで長さを揃える


In [20]:
a = torch.LongTensor([10, 20, 30])
b = torch.LongTensor([15, 25, 35, 45, 55])
c = torch.LongTensor([68, 88])
x = pad_sequence([a, b, c], batch_first=True)
x

tensor([[10, 20, 30,  0,  0],
        [15, 25, 35, 45, 55],
        [68, 88,  0,  0,  0]])

In [21]:
#　バッチ処理を加えた学習

criterion = nn.CrossEntropyLoss(ignore_index=-1)

net.train()
for ep in range(10):
    loss10B, i = 0.0, 0
    for xs, ys in dataloader: # dataloader:16666, xs:3
        xs1, ys1 = [], []

        # torch.LongTensorは同じデータ長でないと処理できないので1つずつ行う
        for k in range(len(xs)): # 3データ分 / 1 batch
            tid = xs[k]
            xs1.append(torch.LongTensor(tid))
            tid = ys[k]
            ys1.append(torch.LongTensor(tid))
        xs1 = pad_sequence(xs1, batch_first=True).to(device)
        ys1 = pad_sequence(ys1, batch_first=True, padding_value=-1.0)
        output = net(xs1)
        ys1 = ys1.type(torch.LongTensor).to(device)
        
        # loss = 0の代わりに最初のデータの結果をlossに格納
        loss = criterion(output[0], ys1[0])
        for h in range(1, len(ys1)):
            loss += criterion(output[h], ys1[h])
        
        if (i % 10 == 0):
            print(ep, i, loss10B)
            loss10B = 0
        else:
            loss10B += loss.item()

        i += 1
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    outfile = 'lstm-' + str(ep) +".model"
    torch.save(net.state_dict(), outfile)
    

0 0 0.0
0 10 3.170003727078438
0 20 2.659604862332344
0 30 2.6207873448729515
0 40 2.3007833622395992
0 50 2.967858709394932
0 60 2.1223824322223663
0 70 3.124029904603958
0 80 2.7409920543432236
0 90 2.6667971462011337
0 100 3.167276553809643
0 110 2.416996270418167
0 120 3.3218023478984833
0 130 2.4553650990128517
0 140 2.6447716131806374
0 150 3.1531126871705055
0 160 1.9885882548987865
0 170 2.885564651340246
0 180 2.392184592783451
0 190 2.0440089777112007
0 200 2.4432522524148226
0 210 3.409407526254654
0 220 3.019283339381218
0 230 2.925480678677559
0 240 3.4410309493541718
0 250 3.202243462204933
0 260 2.4333035945892334
0 270 3.040529318153858
0 280 2.207898423075676
0 290 1.7020707726478577
0 300 1.978959046304226
0 310 2.111776828765869
0 320 1.7165482677519321
0 330 3.1844301521778107
0 340 2.763259381055832
0 350 2.292396619915962
0 360 1.8671567142009735
0 370 2.912996791303158
0 380 2.6315090097486973
0 390 3.0968024134635925
0 400 2.7137328758835793
0 410 3.560827940702

KeyboardInterrupt: 

In [24]:
batch = [
    ([1, 2, 3], 0),
    ([4, 5, 6], 1),
    ([7, 8, 9], 0),
    ([10, 11, 12], 1)
]

list(zip(batch))

[(([1, 2, 3], 0),), (([4, 5, 6], 1),), (([7, 8, 9], 0),), (([10, 11, 12], 1),)]

In [42]:
lstm = nn.LSTM(200, 200, batch_first=True, num_layers=2)

# lstmが2層なので２バッチ分いる
h0 = torch.randn(2, 1, 200)
c0 = torch.randn(2, 1, 200)
yn, (h0, c0) = lstm(xn, (c0, h0))

In [41]:
xn.shape

torch.Size([1, 6, 200])

## 双方向LSTM

In [48]:
lstm = nn.LSTM(200, 200, batch_first=True, num_layers=2, bidirectional=True)

# 双方向が2層分なので　４バッチ分必要
h0 = torch.randn(4, 1, 200)
c0 = torch.randn(4, 1, 200)
yn, (hn, cn) = lstm(xn, (h0, c0))

print(yn.shape)
print(h0.shape)
print(c0.shape)

torch.Size([1, 6, 400])
torch.Size([4, 1, 200])
torch.Size([4, 1, 200])


In [49]:
xn.shape

torch.Size([1, 6, 200])