# 第9章: RNN, CNN

In [1]:
# import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

In [2]:
import tensorflow as tf
from tensorflow import keras

## 80. ID番号への変換
問題51で構築した学習データ中の単語にユニークなID番号を付与したい．学習データ中で最も頻出する単語に1，2番目に頻出する単語に2，……といった方法で，学習データ中で2回以上出現する単語にID番号を付与せよ．そして，与えられた単語列に対して，ID番号の列を返す関数を実装せよ．ただし，出現頻度が2回未満の単語のID番号はすべて0とせよ．

In [3]:
# 学習データ読み込み
df_train = pd.read_csv('train.txt', sep='\t')
df_train.head()

Unnamed: 0,CATEGORY,TITLE
0,b,RPT-Fitch Updates EMEA Consumer ABS Rating Cri...
1,e,"Gurlitt Wants to Return Nazi-Looted Art, Suedd..."
2,b,"UPDATE 1-Fairfax Financial, CEO probed over po..."
3,e,Angelina Jolie - Angelina Jolie Will Not Tight...
4,b,Patent Officials Cancel the Washington Redskin...


### 単語の出現回数取得

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
vectorizer = CountVectorizer()
# 計算 & 変換
train_data = vectorizer.fit_transform(df_train['TITLE'].tolist()).toarray()
feature_names = vectorizer.get_feature_names()

In [6]:
df_word_cnt = (
    pd.DataFrame(train_data, columns=feature_names)
    .sum(axis=0) # 出現回数合計
    .reset_index()
)
df_word_cnt.columns=['word', 'cnt']
df_word_cnt

Unnamed: 0,word,cnt
0,00,2
1,05,3
2,07,1
3,08,1
4,0ff,1
...,...,...
12876,zynga,3
12877,zâ,1
12878,œf,1
12879,œlousyâ,1


### 単語にID紐付け

In [7]:
df_temp = (
    df_word_cnt
    .loc[df_word_cnt['cnt'] >= 2, :] # 出現頻度が2回未満の単語は除外
    .sort_values(by='cnt', ascending=False) # 登場が多い順
    .reset_index(drop=True) # indexを順番に並べる
)
df_temp['ID'] = df_temp.index + 1 # ID列追加
df_temp

Unnamed: 0,word,cnt,ID
0,to,2870,1
1,in,1861,2
2,the,1612,3
3,of,1467,4
4,for,1358,5
...,...,...,...
7661,lip,2,7662
7662,liquidity,2,7663
7663,lira,2,7664
7664,listen,2,7665


In [8]:
df_temp[['word', 'ID']].values

array([['to', 1],
       ['in', 2],
       ['the', 3],
       ...,
       ['lira', 7664],
       ['listen', 7665],
       ['knee', 7666]], dtype=object)

In [9]:
trans_dict = {w: i for w, i in df_temp[['word', 'ID']].values}

### 関数作成

In [10]:
from typing import List

In [21]:
# 与えられた単語列に対して，ID番号の列を返す関数
def trans_text_2_ids(text: str, trans_dict: dict) -> List:
    words = text.split(' ') # 単語列をリスト化
    ret_list = []
    for w in words:
        try:
            ret_list.append(str(trans_dict[w]))
        except KeyError: # 該当単語がなければ0
            ret_list.append('0')
    return ' '.join(ret_list)

print(trans_text_2_ids('to in lira knee aaaaa', trans_dict))

1 2 7664 7666 0


In [25]:
trans_text_2_ids('Patent Officials Cancel the Washington', trans_dict)

'0 0 0 3 0'

## 81. RNNによる予測