In [1]:
import numpy as np
import csv
import glob
import pickle

import re
import torch

from pyknp import Juman

def modification(word):
    """Wrap one token in a single-element list.

    The token itself is passed through unchanged.

    Args:
        word: The token to wrap.

    Returns:
        list: A new list whose only element is ``word``.
    """
    return [word]

def decomposition_file(file):
    """Tokenize the first column of every row of a CSV file with Juman.

    Spaces are removed from the first column of each row before it is
    passed to the analyzer. Rows that are empty, or whose text exceeds
    4096 UTF-8 bytes, are reported with ``<index>  skip`` and ignored.
    The total row count is printed up front, and the row index is printed
    for every processed row whose index is a multiple of 5000.

    Args:
        file: Path of the CSV file to read.

    Returns:
        list: The surface forms (``mrph.midasi``) of all morphemes, in file
        order, each passed through ``modification``.
    """
    jumanpp = Juman()
    # ``with`` guarantees the handle is closed even when reading fails;
    # newline='' is the mode the csv module expects for its input files.
    with open(file, 'r', newline='') as f:
        data = list(csv.reader(f))
    print('number of rows :', len(data))

    parts = []
    for i, row in enumerate(data):
        # csv.reader yields [] for a blank line; indexing it would raise
        # IndexError, so treat it like any other skipped row.
        if not row:
            print(i, ' skip')
            continue
        text = row[0].replace(' ', '')
        # Over-long inputs are not sent to the analyzer
        # (presumably an input-size limit of Juman -- TODO confirm).
        if len(text.encode('utf-8')) > 4096:
            print(i, ' skip')
            continue
        result = jumanpp.analysis(text)
        for mrph in result.mrph_list():
            parts += modification(mrph.midasi)
        if i % 5000 == 0:
            print(i)
    return parts

def decomposition_sentence(sentence):
    """Tokenize a single sentence with Juman.

    Spaces are stripped from the sentence before analysis. A sentence whose
    stripped text exceeds 4096 UTF-8 bytes is not analysed: ``0  skip`` is
    printed and an empty list is returned.

    Args:
        sentence: The text to tokenize.

    Returns:
        list: The surface forms (``mrph.midasi``) of the morphemes, each
        passed through ``modification``.
    """
    analyzer = Juman()
    # The label says "rows", but for a string this is its character count.
    print('number of rows :', len(sentence))

    stripped = sentence.replace(' ', '')
    if len(stripped.encode('utf-8')) > 4096:
        print(0, ' skip')
        return []

    tokens = []
    for morpheme in analyzer.analysis(stripped).mrph_list():
        tokens.extend(modification(morpheme.midasi))
    # Progress marker for the single processed item (index 0).
    print(0)
    return tokens

# Build the word -> ID dictionary.
word2index = {}

file_list = sorted(glob.glob('tweet/tweet2020-11-28.txt'))
print(len(file_list))

parts_list = []
for path in file_list:
    print(path)
    wakati = decomposition_file(path)
    for word in wakati:
        # IDs are assigned in first-seen order; a known word keeps its ID.
        word2index.setdefault(word, len(word2index))
print("vocab size : ", len(word2index))

0
vocab size :  0


In [6]:
def sentence2index(sentence, unk_index=None):
    """Convert a sentence into a tensor of word IDs.

    The sentence is tokenized with ``decomposition_sentence`` and every
    token is looked up in ``word2index``. The result is a ``torch.long``
    tensor so that it can be used as input to PyTorch layers.

    Args:
        sentence: The text to convert.
        unk_index: ID to use for tokens missing from ``word2index``. With
            the default ``None`` such tokens raise ``KeyError``.

    Returns:
        torch.Tensor: 1-D tensor of dtype ``torch.long``, one ID per token.

    Raises:
        KeyError: If ``unk_index`` is ``None`` and at least one token is not
            in ``word2index``; the message lists every missing token.
    """
    wakati = decomposition_sentence(sentence)
    if unk_index is None:
        # Report all unknown tokens at once rather than only the first one.
        missing = [w for w in wakati if w not in word2index]
        if missing:
            raise KeyError(f"words not in word2index: {missing}")
        ids = [word2index[w] for w in wakati]
    else:
        ids = [word2index.get(w, unk_index) for w in wakati]
    return torch.tensor(ids, dtype=torch.long)

# Smoke test: encode one sample sentence and show its IDs.
test = "例のあのメニューも！ニコニコ超会議のフードコートメニュー14種類紹介（前半）"
test_ids = sentence2index(test)
print(test_ids)
# Output recorded from an earlier run (the IDs depend on the current word2index):
# tensor([11320,     3,   449,  5483,    26,  3096,  1493,  1368,     3, 11371, 7835,   174,  8280])

number of rows : 38
0
tensor([14066,  1239,  2490,    29,    88,  8210,  2823,  6110,     5, 10414,
        17038,  2490,  7240,   806,  3736,   555,   116,   560])


In [13]:
import torch.nn as nn
# Vocabulary size: number of distinct words in word2index.
VOCAB_SIZE = len(word2index)
# Dimensionality of each word vector.
EMBEDDING_DIM = 10
test = "私の前に立った僕「僕はAKBの高橋を守る」"
# Convert the sentence into its sequence of word IDs.
inputs = sentence2index(test)
# Look up the vectors of all words in one call.
embeds = nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM)
sentence_matrix = embeds(inputs)
print(sentence_matrix.shape)
print(sentence_matrix)

number of rows : 21
0
torch.Size([15, 10])
tensor([[ 0.4359, -1.4191,  1.4828, -0.0416,  1.9151,  0.4492,  0.0336,  0.2881,
          0.8882, -1.2818],
        [ 0.4232,  1.2295, -0.3105, -0.1977,  0.7173,  2.7642, -0.1055,  0.5783,
          0.3969, -1.0393],
        [-0.5765, -1.0940, -1.4396,  2.5868, -0.1009,  2.5808,  1.0113, -0.7437,
          0.5795, -0.5021],
        [-1.8677, -0.8596,  0.4068,  0.8959,  0.0293,  0.0527, -0.0734,  0.3114,
         -1.1921, -2.2002],
        [ 0.8610,  0.4022,  0.4114,  2.0507, -0.9807,  0.1996, -0.7592, -0.4125,
          0.8014,  1.0649],
        [ 0.5940, -1.5065, -1.8374, -1.4471, -0.5120, -0.4337, -0.1018, -1.2060,
          1.9400,  0.7027],
        [-0.5200,  1.0667,  1.0329,  0.1118, -0.3006,  0.6896,  2.5931, -0.2750,
          0.3796,  0.6671],
        [ 0.5940, -1.5065, -1.8374, -1.4471, -0.5120, -0.4337, -0.1018, -1.2060,
          1.9400,  0.7027],
        [-0.8385, -0.6899, -0.6546, -1.0359, -0.9235,  0.8911, -0.9508,  1.6996,
    