# 第9章: RNN, CNN

In [1]:
# import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

In [2]:
import tensorflow as tf
from tensorflow import keras

## 80. ID番号への変換
問題51で構築した学習データ中の単語にユニークなID番号を付与したい．学習データ中で最も頻出する単語に1，2番目に頻出する単語に2，……といった方法で，学習データ中で2回以上出現する単語にID番号を付与せよ．そして，与えられた単語列に対して，ID番号の列を返す関数を実装せよ．ただし，出現頻度が2回未満の単語のID番号はすべて0とせよ．

In [3]:
# 学習データ読み込み
df_train = pd.read_csv('train.txt', sep='\t')
df_train.head()

Unnamed: 0,CATEGORY,TITLE
0,b,RPT-Fitch Updates EMEA Consumer ABS Rating Cri...
1,e,"Gurlitt Wants to Return Nazi-Looted Art, Suedd..."
2,b,"UPDATE 1-Fairfax Financial, CEO probed over po..."
3,e,Angelina Jolie - Angelina Jolie Will Not Tight...
4,b,Patent Officials Cancel the Washington Redskin...


### 単語の出現回数取得

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
vectorizer = CountVectorizer()
# 計算 & 変換
train_data = vectorizer.fit_transform(df_train['TITLE'].tolist()).toarray()
feature_names = vectorizer.get_feature_names()

In [6]:
df_word_cnt = (
    pd.DataFrame(train_data, columns=feature_names)
    .sum(axis=0) # 出現回数合計
    .reset_index()
)
df_word_cnt.columns=['word', 'cnt']
df_word_cnt

Unnamed: 0,word,cnt
0,00,2
1,05,3
2,07,1
3,08,1
4,0ff,1
...,...,...
12876,zynga,3
12877,zâ,1
12878,œf,1
12879,œlousyâ,1


### 単語にID紐付け

In [7]:
df_temp = (
    df_word_cnt
    .loc[df_word_cnt['cnt'] >= 2, :] # 出現頻度が2回未満の単語は除外
    .sort_values(by='cnt', ascending=False) # 登場が多い順
    .reset_index(drop=True) # indexを順番に並べる
)
df_temp['ID'] = df_temp.index + 1 # ID列追加
df_temp

Unnamed: 0,word,cnt,ID
0,to,2870,1
1,in,1861,2
2,the,1612,3
3,of,1467,4
4,for,1358,5
...,...,...,...
7661,lip,2,7662
7662,liquidity,2,7663
7663,lira,2,7664
7664,listen,2,7665


In [8]:
df_temp[['word', 'ID']].values

array([['to', 1],
       ['in', 2],
       ['the', 3],
       ...,
       ['lira', 7664],
       ['listen', 7665],
       ['knee', 7666]], dtype=object)

In [9]:
trans_dict = {w: i for w, i in df_temp[['word', 'ID']].values}

### 関数作成

In [10]:
from typing import List

In [11]:
# 与えられた単語列に対して，ID番号の列を返す関数
def trans_text_2_ids(text: str, trans_dict: dict) -> str:
    words = text.split(' ') # 単語列をリスト化
    ret_list = []
    for w in words:
        try:
            ret_list.append(trans_dict[w])
        except KeyError: # 該当単語がなければ0
            ret_list.append(0)
    return ret_list

print(trans_text_2_ids('to in lira knee aaaaa', trans_dict))

[1, 2, 7664, 7666, 0]


## 81. RNNによる予測

### 文字列を単語列（単語のID番号のone-hot表記）に変換

In [12]:
# 学習データをID番号の列に変換
ids_train = df_train['TITLE'].apply(trans_text_2_ids, trans_dict=trans_dict).values

In [13]:
ids_train[:5]

array([list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
       list([0, 0, 1, 0, 0, 0, 0, 0]),
       list([0, 0, 0, 0, 2495, 23, 609, 1354, 308]),
       list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
       list([0, 0, 0, 3, 0, 0, 0, 0])], dtype=object)

In [14]:
# 一番長い文言の単語数
t = max(map(len, ids_train))
t

125

In [15]:
# 長さ揃えてnumpy配列に入れる
temp = []
for x in ids_train:
    # 一番長い長さに合わせる(0埋め)
    while len(x) < 125:
        x.append(0)
    temp.append(x)
x_train = np.array(temp)

In [16]:
x_train

array([[  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   1, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       ...,
       [  0,   0, 461, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0]])

In [17]:
x_train.shape

(10672, 125)

In [18]:
np.max(x_train)

7666

### 予測モデル作成

In [19]:
dim_w = 300
dim_h = 50
dim_v = np.max(x_train) + 1
label_num = 4

In [20]:
# h0
initial_state = tf.constant(np.zeros(dim_h))
initial_state

<tf.Tensor: shape=(50,), dtype=float64, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])>

In [21]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(dim_v, dim_w),
    tf.keras.layers.SimpleRNN(
        units=dim_h,
        activation='tanh',
        return_sequences=False,
    ),
    tf.keras.layers.Dense(label_num, use_bias=True),
    tf.keras.layers.Softmax(),
])

In [22]:
# model.variables

In [23]:
# yを計算する
y_pred = model(x_train)
y_pred

<tf.Tensor: shape=(10672, 4), dtype=float32, numpy=
array([[0.293775  , 0.23950885, 0.21504033, 0.2516758 ],
       [0.29383075, 0.2395006 , 0.21549168, 0.25117695],
       [0.2932498 , 0.24055214, 0.21551496, 0.25068316],
       ...,
       [0.29352468, 0.23973052, 0.21510008, 0.25164467],
       [0.293775  , 0.23950885, 0.21504033, 0.2516758 ],
       [0.293775  , 0.23950885, 0.21504033, 0.2516758 ]], dtype=float32)>

In [24]:
y_pred.shape

TensorShape([10672, 4])

## 82. 確率的勾配降下法による学習
確率的勾配降下法（SGD: Stochastic Gradient Descent）を用いて，問題81で構築したモデルを学習せよ．訓練データ上の損失と正解率，評価データ上の損失と正解率を表示しながらモデルを学習し，適当な基準（例えば10エポックなど）で終了させよ．