# RNN を使ったテキスト分類

In [None]:
# !pip install -q tf-nightly
# import tensorflow_datasets as tfds
# !pip3 list | grep tensorflow
# import tensorflow as tf

# !pip3 install sentencepiece
import sentencepiece as spm

import json
import os
import pandas as pd

In [None]:
# Directory of the trained run to evaluate (uncomment exactly one entry).
run_name = (
    # '_best_weight'
    # '0715_13h43m01s_En-k_Bi_400x256'
    '0711_17h36m40s_Math-k_Bi_400x256'
    # '0708_02h38m30s_En-a'
    # '0708_03h42m45s_Math-a'
)
save_m_dir = os.path.join('_logs', run_name)

# Settings that were stored next to the run; later cells read the
# 'Experiment' and 'Model' sections of this dictionary.
params_path = os.path.join(save_m_dir, 'model_params.json')
with open(params_path, 'r') as param_f:
    param_dict = json.load(param_f)


## Data


### load

In [None]:
# XML-related settings that were saved with the trained run.
XML_param = param_dict['Experiment']['XML条件']
sbj_names = XML_param['sbj_names']
elmnt_name = XML_param['elmnt_name']
attr_name = XML_param['attr_name']
is_remove_xml = XML_param['is_remove_xml']

# Read one TSV per subject, then concatenate once (instead of growing a
# frame inside the loop, which re-copies all previous rows every iteration).
subject_frames = []
for sbj in sbj_names:
    print(sbj)
    attr_csv_path = f'../{sbj}_{attr_name}_ds.tsv'
    df_tmp = pd.read_csv(attr_csv_path, delimiter='\t')
    subject_frames.append(df_tmp)

# Fail early with a clear message rather than with an AttributeError on None.
if not subject_frames:
    raise ValueError('sbj_names is empty: no dataset TSV was loaded.')

# Show which element / attribute pair this run classifies.
print(elmnt_name, attr_name)

df = pd.concat(subject_frames)
df = df.reset_index(drop=True)
df = df.dropna()  # drop rows containing NaN

In [None]:
# Strip XML tags from the 'contents' column when the run was configured to.
if is_remove_xml:
    import re

    def remove_xml(xml_child_str):
        """Return ``xml_child_str`` with every ``<...>`` tag removed.

        The single non-greedy pattern already covers opening, closing and
        self-closing tags, so no separate pass for ``</...>`` is needed.
        ``.`` does not match newlines, so a tag split across lines is kept.
        """
        return re.sub('<.*?>', '', xml_child_str)

    df['contents'] = df['contents'].map(remove_xml)
df

### Tokenize
SentencePiece を使用。

In [None]:
# Working directory for the SentencePiece corpus dump and trained model.
m_dir = '_logs/SentencePiece'
os.makedirs(m_dir, exist_ok=True)

# The whole frame, written as TSV, serves as the tokenizer training corpus.
df.to_csv(f'{m_dir}/tmp.txt', sep='\t')

# Train a 2000-piece model with <sep>, <cls> and <pad> as user-defined symbols.
train_args = f'--input={m_dir}/tmp.txt --model_prefix={m_dir}/m  --user_defined_symbols=<sep>,<cls>,<pad>   --vocab_size=2000'
spm.SentencePieceTrainer.train(train_args)

# Processor used by every later cell; it loads the model trained just above.
sp = spm.SentencePieceProcessor()
sp.load(f'{m_dir}/m.model')

In [None]:
# Round-trip one sample sentence: text -> pieces, text -> ids, ids -> text.
sample_text = '次の問い(問１～３)の会話の 17 ～ 19 に入れるのに最も適当なものを，それぞれ以下の①～④のうちから一つずつ選べ。	'

tokenized_tokens = sp.encode_as_pieces(sample_text)
print(tokenized_tokens)

tokenized_ids = sp.encode_as_ids(sample_text)
print(tokenized_ids)

decoded_text = sp.decode(tokenized_ids)
print(decoded_text)

In [None]:
# example_content = df_tmp['contents'][20]
# print(example_content, sp.encode_as_pieces(example_content))

# for index in encoded_string:
#   print('{} ----> {}'.format(index, encoder.decode([index])))

## Train 用データの準備

In [None]:
# Vocabulary: SentencePiece piece -> integer id.
# Id 0 is reserved for the padding piece "<pad>", which is used later to
# bring every sequence to the same length.
word2index = {"<pad>": 0}

# Each piece gets the next free id the first time it is seen; pieces that
# are already registered keep their id.
for inst, cont in zip(df['<instruction/>'], df['contents']):
    for word in sp.encode_as_pieces(inst + cont):
        word2index.setdefault(word, len(word2index))

print("vocab size : ", len(word2index))

In [None]:
# Label set = union of the per-subject class lists stored under ../class_set.
categories = set()
for sbj in sbj_names:
    class_set_path = f'../class_set/{sbj}-{elmnt_name}-{attr_name}.json'
    with open(class_set_path) as f:
        categories.update(json.load(f))

# Sort so that the class -> index assignment built from this list is the
# same on every run; an unsorted set would make experiments unrepeatable.
categories = sorted(categories)
print(categories)
print(len(categories))

In [None]:
## 系列の長さを揃えてバッチでまとめる
from sklearn.model_selection import train_test_split
import numpy as np
import random
from tqdm import tqdm

# Class name -> class index, numbered in the order of `categories`.
# A repeated name keeps the index it received the first time.
cat2index = {}
for cat in categories:
    cat2index.setdefault(cat, len(cat2index))

def sentence2index(sentence):
    """Tokenize ``sentence`` with ``sp`` and return the ids of its pieces.

    Each piece is looked up in ``word2index``; a piece that is not in the
    vocabulary raises ``KeyError``.
    """
    piece_ids = []
    for piece in sp.encode_as_pieces(sentence):
        piece_ids.append(word2index[piece])
    return piece_ids

def category2index(cat):
    """Return the class index registered for ``cat`` in ``cat2index``.

    Raises ``KeyError`` when ``cat`` is not a known class name.
    """
    class_id = cat2index[cat]
    return class_id

# Unpadded id sequences and their class indices, one entry per data row.
index_datasets_c_xml_tmp = []
index_datasets_category = []

# The model input is the instruction text followed directly by the contents.
rows = zip(df['<instruction/>'], df['contents'], df[attr_name])
for inst, cont, category in tqdm(rows):
    index_c_xml = sentence2index(inst + cont)
    index_category = category2index(category)
    index_datasets_c_xml_tmp.append(index_c_xml)
    index_datasets_category.append(index_category)

# Longest sequence length; every other sequence is padded up to this length.
max_len = max((len(seq) for seq in index_datasets_c_xml_tmp), default=0)

# Bring every sequence to `max_len` by prepending the padding id 0.
# Front padding is deliberate: with padding at the end the model did not
# train properly.
index_datasets_c_xml = [
    [0] * (max_len - len(c_xml)) + c_xml
    for c_xml in tqdm(index_datasets_c_xml_tmp)
]


In [None]:
import torch

# True  -> split 70% train / 30% test and evaluate on the test part.
# False -> evaluate on the whole dataset.
is_split_train_test = False

if not is_split_train_test:
    x_test = torch.tensor(index_datasets_c_xml)
    y_test = torch.tensor(index_datasets_category)
else:
    x_train, x_test, y_train, y_test = train_test_split(
        index_datasets_c_xml, index_datasets_category, train_size=0.7)

    # Convert the Python lists into tensors.
    x_train = torch.tensor(x_train)
    y_train = torch.tensor(y_train)
    x_test = torch.tensor(x_test)
    y_test = torch.tensor(y_test)

    print(type(x_train))

In [None]:
###  Dataset  ###
from torch.utils.data import TensorDataset

# Pair inputs with labels so a DataLoader can batch them together.
test_dataset = TensorDataset(x_test, y_test)
if is_split_train_test:
    train_dataset = TensorDataset(x_train, y_train)

# Sanity check: dataset size, then the shape and label of one sample.
index = 0
print(len(test_dataset))
sample_x, sample_y = test_dataset[index]
print(sample_x.size())
print(sample_y)

## Model

In [None]:
import torch
import torch.optim as optim

%load_ext autoreload
from model_abc.LSTM_text_classify_model import (
    LSTM_TextClassifier_ptModel,
    BiLSTM_TextClassifier_ptModel
)
%autoreload

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [None]:
# Model hyper-parameters.
# Architecture settings come from the saved run configuration; the vocabulary
# and class counts are derived from the data prepared above.
model_param = param_dict['Model']
MODEL_NAME = model_param['MODEL_NAME']
EMBEDDING_DIM = model_param['EMBEDDING_DIM']
HIDDEN_DIM = model_param['HIDDEN_DIM']
VOCAB_SIZE = len(word2index)
TAG_SIZE = len(categories)

## Build the classifier described by the saved run configuration.
def get_model():
    """Instantiate the model selected by ``MODEL_NAME`` and move it to ``device``.

    The constructor receives ``EMBEDDING_DIM``, ``HIDDEN_DIM``, ``VOCAB_SIZE``
    and ``TAG_SIZE``, in that order.

    Returns:
        A new ``LSTM_TextClassifier_ptModel`` (``MODEL_NAME == 'LSTM'``) or
        ``BiLSTM_TextClassifier_ptModel`` (``MODEL_NAME == 'BiLSTM'``).

    Raises:
        ValueError: if ``MODEL_NAME`` is neither ``'LSTM'`` nor ``'BiLSTM'``.
            Previously this case surfaced as an ``UnboundLocalError``.
    """
    model_classes = {
        'LSTM': LSTM_TextClassifier_ptModel,
        'BiLSTM': BiLSTM_TextClassifier_ptModel,
    }
    if MODEL_NAME not in model_classes:
        raise ValueError(
            f"Unsupported MODEL_NAME {MODEL_NAME!r}; "
            f"expected one of {sorted(model_classes)}")
    model_cls = model_classes[MODEL_NAME]
    return model_cls(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, TAG_SIZE).to(device)

# model = LSTM_TextClassifier_ptModel(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, TAG_SIZE).to(device)

---

## Experiment Evaluate

In [None]:
## Load the trained model
# Optional suffix of the weights file name; empty selects the default file.
opt = ''  #+ 'latest'

# NOTE(review): the 'model_weghts' spelling presumably matches the file name
# written by the training run -- confirm before renaming it.
weights_path = f'{save_m_dir}/model_weghts{opt}.pth'

model = get_model()
model.load_weights(load_m_path=weights_path)

import torch.nn as nn

# Loss passed to model.predict() in the evaluation cell.
criterion = nn.NLLLoss()

In [None]:
from torch.utils.data import DataLoader
from collections import deque

# Evaluate the loaded model on `test_dataset` and collect the per-sample
# answers / predictions for the analysis cells below.
test_acc = 0
total_count = 0
test_loss = 0  # defined up front so the summary line works with no batches
test_dataloader = DataLoader(test_dataset, batch_size=50, shuffle=False)

y_ans_list = []
y_pred_list = []

with torch.no_grad():
    for X_batch, Y_batch in test_dataloader:
        test_loss, pred_batch_arr = model.predict(X_batch, Y_batch, criterion, device)
        # Predicted class = index of the highest score along dimension 1.
        _, pred_batch = torch.max(pred_batch_arr, 1)
        batch_ans = Y_batch.tolist()
        batch_pred = pred_batch.tolist()
        for pred_id, ans_id in zip(batch_pred, batch_ans):
            if pred_id == ans_id:
                test_acc += 1
            else:
                # Misclassified sample: predicted class name, then true class name.
                print(categories[pred_id], categories[ans_id])
        total_count += len(batch_ans)
        y_ans_list += batch_ans
        y_pred_list += batch_pred

# Turn the hit count into a ratio; skipped for an empty dataset to avoid
# dividing by zero.
if total_count:
    test_acc /= total_count

# NOTE: `test_loss` is the value returned for the last batch only.
print(f"[Info] acc : {test_acc},  loss : {test_loss}")

In [None]:
# print(f"[Info] y_ans_list : {y_ans_list}")
# print(f"[Info] y_pred_list : {y_pred_list}")

---

## Experiment Analyze
- [x]  （棒グラフ）「クラスごとのデータ数」を作る。
- [x]  （ヒートマップ）「混同行列」 を出す（おそらく、かなりの偏りがあるはず）
- [x]  よく間違えているデータの列挙。
- [ ]  Attention などを挟んで、注目単語を可視化する？

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Switches for the analysis section.
# Figures (count plots, confusion-matrix heatmap).
do_analyze_graph = False

# Text statistics.
do_analyze_statistic = True


### （棒グラフ）「クラスごとのデータ数」

In [None]:
# Bar chart of the number of samples per class, most frequent class first.
# Guarded by `do_analyze_graph`; the previously used name `do_analyze` is
# never defined and raised NameError.
if do_analyze_graph:
    # Scale the figure with the number of classes so the tick labels fit.
    w = len(categories)
    h = w
    plt.figure(figsize=(w,h))

    g = sns.countplot(x=attr_name, data=df, 
                      order=df[attr_name].value_counts().index)
    g.set_xticklabels(g.get_xticklabels(), rotation=90)


In [None]:
# Same count plot, with the bars in the sorted `categories` order.
# Guarded by `do_analyze_graph`; the previously used name `do_analyze` is
# never defined and raised NameError.
if do_analyze_graph:
    g = sns.countplot(x=attr_name, data=df, 
                      order=categories)
    g.set_xticklabels(g.get_xticklabels(), rotation=90)


In [None]:
# Per-class precision / recall / F1.
# `y_ans_list` and `y_pred_list` hold class indices, so the full index range
# is passed as `labels` and the class names as `target_names`: every class
# gets a row (including classes absent from the evaluated data) and rows are
# labelled by name instead of by bare index.
report = classification_report(
                y_ans_list, y_pred_list,
                labels=list(range(len(categories))),
                target_names=[str(cat) for cat in categories],
            )
print(report)

### （ヒートマップ）「混同行列」 を出す

In [None]:
# Confusion matrix over the full class-index range.
# Without `labels`, only classes that occur in the answers or predictions are
# included, so the matrix can be smaller than `categories` and wrapping it in
# a DataFrame with index=categories fails with a shape mismatch.
cm = confusion_matrix(y_ans_list, y_pred_list,
                      labels=list(range(len(categories))))
cm = pd.DataFrame(data=cm, index=categories, columns=categories)

In [None]:
# Heatmap of the confusion matrix, also written to a PNG file.
# Guarded by `do_analyze_graph`; the previously used name `do_analyze` is
# never defined and raised NameError.
if do_analyze_graph:
    # Scale the figure with the number of classes so the annotations stay legible.
    w = len(categories)
    h = w * 7 / 10
    plt.figure(figsize=(w,h))
    sns.heatmap(cm, square=True, cbar=True, annot=True, cmap='Blues')
    plt.savefig('sklearn_confusion_matrix.png')


### よく間違えているデータの列挙。

___