## 環境構築パート


In [None]:
!pip uninstall -y tensorflow
!pip uninstall -y keras
!pip install tensorflow==2.2
!pip install keras=2.2

In [1]:
import tensorflow as tf
print(tf.__version__)

2.2.0


#### ここでランタイムを再起動させる

In [None]:
!pip install tensorflow_addons==0.11.2
!pip install -U numpy==1.18.5
!pip install 'kashgari>=2.0.2'
!pip install sentencepiece

#### driveのマウント

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
%cd "drive/My Drive/Colab Notebooks/self_compile"

/content/drive/My Drive/Colab Notebooks/self_compile


In [5]:
%ls

[0m[01;34minputs[0m/  self_compile.ipynb


#### データの前処理

In [40]:
import codecs
import re
import requests
from bs4 import BeautifulSoup

##### txtファイルの前処理(データ形式の統一->後々この処理いらない可能性が出てきたが一旦保留)
corpus_itest = []
with open('inputs/input_test.txt') as f:
    lines = f.read()
    for l in lines.split("\n"):
      l = l.lstrip()
      corpus_itest.append(l)
# ファイルの保存       
print(*corpus_itest, sep="\n", file=codecs.open("itest.txt", "w", "utf-8"))
corpus_ctest = []
with open('inputs/collect_test.txt') as f:
    lines = f.read()
    for l in lines.split("\n"):
      l = l.lstrip()
      corpus_ctest.append(l)
# ファイルの保存       
print(*corpus_ctest, sep="\n", file=codecs.open("ctest.txt", "w", "utf-8"))

######## sentencepieceによるデータの単語分割
"""
  sentencepieceによる単語の分割は一般的な文章に対しては良いのかもしれないが今回のような
  プログラム体のテキストデータに対してはどうなのだろうというところがあるためここの分割方法に
  ついては別途要検討である可能性がある
"""
import sentencepiece as sp
# 学習の実行 - 非complete版
sp.SentencePieceTrainer.Train(
   input="itest.txt",
   model_prefix="sentencepiece",
   model_type="word",
   vocab_size=82,
   add_dummy_prefix="false", #先頭の_を入れるか入れないか => 先頭に来る単語と文中に来る単語で意味が異なる可能性によってtrue,　falseをきめる
)
#モデルの作成
sp_itest = sp.SentencePieceProcessor()
sp_itest.Load("sentencepiece.model")

# 学習の実行 - complete版
sp.SentencePieceTrainer.Train(
   input="ctest.txt",
   model_prefix="sentencepiece",
   model_type="word",
   vocab_size=82,
   add_dummy_prefix="false", #先頭の_を入れるか入れないか => 先頭に来る単語と文中に来る単語で意味が異なる可能性によってtrue,　falseをきめる
)
#モデルの作成
sp_ctest = sp.SentencePieceProcessor()
sp_ctest.Load("sentencepiece.model")

######### kashgariに埋め込むためのデータ整形
test_x = []
test_y = []
for i in range(len(corpus_ctest)):
  input_num = sp_itest.EncodeAsPieces(corpus_itest[i])
  collect_num = sp_ctest.EncodeAsPieces(corpus_ctest[i])
  if (len(input_num) != 0):
   #if (len(input_num) == len(collect_num)):
   test_x.append(input_num)
   tmp_y = []
   for j in range(len(input_num)):
     if (input_num[j] == collect_num[j]):
       tmp_y.append('C')
     else:
       tmp_y.append('X')
   test_y.append(tmp_y)

In [41]:
import codecs
import re
import requests
from bs4 import BeautifulSoup

corpus_itrain = []
with open('inputs/input_train.txt') as f:
    lines = f.read()
    
    for l in lines.split("\n"):
      l = l.lstrip()
      corpus_itrain.append(l)

# ファイルの保存       
print(*corpus_itrain, sep="\n", file=codecs.open("itrain.txt", "w", "utf-8"))
corpus_ctrain = []
with open('inputs/collect_train.txt') as f:
    lines = f.read()
    
    for l in lines.split("\n"):
      l = l.lstrip()
      corpus_ctrain.append(l)

# ファイルの保存       
print(*corpus_ctrain, sep="\n", file=codecs.open("ctrain.txt", "w", "utf-8"))
import sentencepiece as sp

# 学習の実行 - 非complete版
sp.SentencePieceTrainer.Train(
   input="itrain.txt",
   model_prefix="sentencepiece",
   model_type="word",
   vocab_size=82,
   add_dummy_prefix="false", #先頭の_を入れるか入れないか => 先頭に来る単語と文中に来る単語で意味が異なる可能性によってtrue,　falseをきめる
)
#モデルの作成
sp_itrain = sp.SentencePieceProcessor()
sp_itrain.Load("sentencepiece.model")

# 学習の実行 - complete版
sp.SentencePieceTrainer.Train(
   input="ctrain.txt",
   model_prefix="sentencepiece",
   model_type="word",
   vocab_size=82,
   add_dummy_prefix="false", #先頭の_を入れるか入れないか => 先頭に来る単語と文中に来る単語で意味が異なる可能性によってtrue,　falseをきめる
)
#モデルの作成
sp_ctrain = sp.SentencePieceProcessor()
sp_ctrain.Load("sentencepiece.model")
train_x = []
train_y = []
for i in range(len(corpus_ctrain)):
  input_num = sp_itrain.EncodeAsPieces(corpus_itrain[i])
  collect_num = sp_ctrain.EncodeAsPieces(corpus_ctrain[i])
  if (len(input_num) != 0):
   #if (len(input_num) == len(collect_num)):
   train_x.append(input_num)
   tmp_y = []
   for j in range(len(input_num)):
     if (input_num[j] == collect_num[j]):
       tmp_y.append('C')
     else:
       tmp_y.append('X')
   train_y.append(tmp_y)

In [42]:
import codecs
import re
import requests
from bs4 import BeautifulSoup

corpus_ivalid = []
with open('inputs/input_valid.txt') as f:
    lines = f.read()
    
    for l in lines.split("\n"):
      l = l.lstrip()
      corpus_ivalid.append(l)

# ファイルの保存       
print(*corpus_ivalid, sep="\n", file=codecs.open("ivalid.txt", "w", "utf-8"))
corpus_cvalid = []
with open('inputs/collect_valid.txt') as f:
    lines = f.read()
    
    for l in lines.split("\n"):
      l = l.lstrip()
      corpus_cvalid.append(l)

# ファイルの保存       
print(*corpus_cvalid, sep="\n", file=codecs.open("cvalid.txt", "w", "utf-8"))
import sentencepiece as sp

# 学習の実行 - 非complete版
sp.SentencePieceTrainer.Train(
   input="ivalid.txt",
   model_prefix="sentencepiece",
   model_type="word",
   vocab_size=82,
   add_dummy_prefix="false", #先頭の_を入れるか入れないか => 先頭に来る単語と文中に来る単語で意味が異なる可能性によってtrue,　falseをきめる
)
#モデルの作成
sp_ivalid = sp.SentencePieceProcessor()
sp_ivalid.Load("sentencepiece.model")

# 学習の実行 - complete版
sp.SentencePieceTrainer.Train(
   input="cvalid.txt",
   model_prefix="sentencepiece",
   model_type="word",
   vocab_size=82,
   add_dummy_prefix="false", #先頭の_を入れるか入れないか => 先頭に来る単語と文中に来る単語で意味が異なる可能性によってtrue,　falseをきめる
)
#モデルの作成
sp_cvalid = sp.SentencePieceProcessor()
sp_cvalid.Load("sentencepiece.model")

valid_x = []
valid_y = []
for i in range(len(corpus_cvalid)):
  input_num = sp_ivalid.EncodeAsPieces(corpus_ivalid[i])
  collect_num = sp_cvalid.EncodeAsPieces(corpus_cvalid[i])
  if (len(input_num) != 0):
   #if (len(input_num) == len(collect_num)):
   valid_x.append(input_num)
   tmp_y = []
   for j in range(len(input_num)):
     if (input_num[j] == collect_num[j]):
       tmp_y.append('C')
     else:
       tmp_y.append('X')
   valid_y.append(tmp_y)

### 各入力データのフォーマット
- train_x  
学習データとして与える一文を組み合わせたlist  
自分の例であれば一行のコードをsentencepieceによって分割させたもの
- train_y  
train_xに紐づけるべきOかXが記されたラベルのlist  
listの長さはtrain_xと対応づけなければエラーが出るはずなので慎重に行う必要がある。
- valid_x  
よくわからないけど、trainとは別の補足をするための構造
- valid_y  
valid_xに対応するoxのlist

- test_x  
評価用サンプルデータ、訓練したモデルからこのlistに対して評価を行い各単語に対してscoreを計算する

- test_y  
test_xに対応するoxのlist



In [47]:
import kashgari
from kashgari.tasks.labeling import BiLSTM_Model

model = BiLSTM_Model()

model.fit(train_x, train_y, valid_x, valid_y)
# Evaluate the model

model.evaluate(test_x, test_y)

# Model data will save to `saved_ner_model` folder
model.save('saved_classification_model')

# Load saved model
loaded_model = BiLSTM_Model.load_model('saved_classification_model')
loaded_model.predict(test_x)

Preparing text vocab dict: 100%|██████████| 79216/79216 [00:00<00:00, 722518.66it/s]
Preparing text vocab dict: 100%|██████████| 35031/35031 [00:00<00:00, 695516.60it/s]
2022-08-30 05:10:01,334 [DEBUG] kashgari - --- Build vocab dict finished, Total: 10540 ---
2022-08-30 05:10:01,336 [DEBUG] kashgari - Top-10: ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '▁=', '}', '▁{', '#define', '▁<<', 'int']
Preparing text vocab dict: 100%|██████████| 79216/79216 [00:00<00:00, 984672.24it/s]
Preparing text vocab dict: 100%|██████████| 35031/35031 [00:00<00:00, 932685.84it/s]
2022-08-30 05:10:01,472 [DEBUG] kashgari - --- Build vocab dict finished, Total: 3 ---
2022-08-30 05:10:01,474 [DEBUG] kashgari - Top-10: ['[PAD]', 'C', 'X']
Calculating sequence length: 100%|██████████| 79216/79216 [00:00<00:00, 1656013.80it/s]
Calculating sequence length: 100%|██████████| 35031/35031 [00:00<00:00, 1691443.97it/s]
2022-08-30 05:10:01,583 [DEBUG] kashgari - Calculated sequence length = 10
2022-08-30 05:10:02,836 [DEBUG

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


2022-08-30 05:15:31,096 [DEBUG] kashgari - predict seq_length: None, input: (171, 31)




2022-08-30 05:15:32,695 [DEBUG] kashgari - predict output: (171, 31)
2022-08-30 05:15:32,697 [DEBUG] kashgari - predict output argmax: [[0 1 1 ... 0 0 0]
 [0 1 1 ... 0 0 0]
 [0 1 1 ... 0 0 0]
 ...
 [0 1 1 ... 0 0 0]
 [0 1 1 ... 0 0 0]
 [0 1 0 ... 0 0 0]]



           precision    recall  f1-score   support

        C     0.7778    0.8160    0.7964       163
        X     0.0000    0.0000    0.0000        43

micro avg     0.7778    0.6456    0.7056       206
macro avg     0.6154    0.6456    0.6302       206



2022-08-30 05:15:33,197 [INFO] kashgari - model saved to /content/drive/MyDrive/Colab Notebooks/self_compile/saved_classification_model
2022-08-30 05:15:34,516 [DEBUG] kashgari - predict seq_length: None, input: (171, 31)




2022-08-30 05:15:36,078 [DEBUG] kashgari - predict output: (171, 31)
2022-08-30 05:15:36,080 [DEBUG] kashgari - predict output argmax: [[0 1 1 ... 0 0 0]
 [0 1 1 ... 0 0 0]
 [0 1 1 ... 0 0 0]
 ...
 [0 1 1 ... 0 0 0]
 [0 1 1 ... 0 0 0]
 [0 1 0 ... 0 0 0]]


[['C', 'C'],
 ['C', 'C'],
 ['C', 'C'],
 ['C', 'C'],
 ['C'],
 ['C', 'C'],
 ['C', 'C'],
 ['C', 'C'],
 ['C', 'C'],
 ['C', 'C'],
 ['C', 'C'],
 ['C', 'C'],
 ['C'],
 ['C', 'C'],
 ['C', 'C'],
 ['C', 'C'],
 ['C', 'C'],
 ['C'],
 ['C'],
 ['C', 'C'],
 ['C', 'C'],
 ['C', 'C'],
 ['C', 'C', 'C', 'C'],
 ['C', 'C', 'C', 'C', 'C'],
 ['C', 'C'],
 ['C'],
 ['C', 'C'],
 ['C', 'C', 'C'],
 ['C', 'C', 'C'],
 ['C', 'C', 'C', 'C', 'C', 'C', 'C'],
 ['C', 'C'],
 ['C', 'C'],
 ['C', 'C', 'C', 'C'],
 ['C', 'C', 'C'],
 ['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
 ['C'],
 ['C'],
 ['C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C'],
 ['C', 'C', 'C', 'C', 'C', 'C'],
 ['C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C'],
 ['C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
  'C',
 

### extra

```py
# テキストを語彙列に分割&&中身の確認
for i in corpus_itest:
  print(sp_itest.EncodeAsPieces(i))
  print(len(sp_itest.EncodeAsPieces(i)))
```

### サンプル動作

In [44]:
from kashgari.corpus import ChineseDailyNerCorpus

ttrain_x, ttrain_y = ChineseDailyNerCorpus.load_data('train')
vvalid_x, vvalid_y = ChineseDailyNerCorpus.load_data('valid')
ttest_x, ttest_y = ChineseDailyNerCorpus.load_data('test')

2022-08-30 03:57:47,047 [DEBUG] kashgari - loaded 20864 samples from /root/.kashgari/datasets/china-people-daily-ner-corpus/example.train. Sample:
x[0]: ['克', '罗', '地', '亚', '政', '府', '2', '4', '日', '正', '式', '向', '阿', '根', '廷', '政', '府', '提', '出', '引', '渡', '在', '阿', '侨', '居', '多', '年', '的', '前', '纳', '粹', '战', '犯', '沙', '基', '奇', '的', '要', '求', '。']
y[0]: ['B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O']
2022-08-30 03:57:47,113 [DEBUG] kashgari - loaded 2318 samples from /root/.kashgari/datasets/china-people-daily-ner-corpus/example.dev. Sample:
x[0]: ['陈', '寅', '恪', '曾', '自', '称', '“', '思', '想', '囿', '于', '咸', '丰', '、', '同', '治', '之', '世', '，', '议', '论', '近', '乎', '曾', '湘', '乡', '（', '曾', '国', '藩', '）', '、', '张', '南', '皮', '（', '张', '之', '洞', '）', '之', '间', '”', '。']
y[0]: ['B-PER', 'I-PER',

In [45]:
import kashgari
from kashgari.tasks.labeling import BiLSTM_Model

model = BiLSTM_Model()

model.fit(ttrain_x, ttrain_y, vvalid_x, vvalid_y)
# Evaluate the model

model.evaluate(ttest_x, ttest_y)

Preparing text vocab dict: 100%|██████████| 20864/20864 [00:00<00:00, 108706.81it/s]
Preparing text vocab dict: 100%|██████████| 2318/2318 [00:00<00:00, 102916.27it/s]
2022-08-30 03:58:09,839 [DEBUG] kashgari - --- Build vocab dict finished, Total: 3500 ---
2022-08-30 03:58:09,842 [DEBUG] kashgari - Top-10: ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '，', '的', '。', '国', '一', '、']
Preparing text vocab dict: 100%|██████████| 20864/20864 [00:00<00:00, 145830.24it/s]
Preparing text vocab dict: 100%|██████████| 2318/2318 [00:00<00:00, 144626.87it/s]
2022-08-30 03:58:10,016 [DEBUG] kashgari - --- Build vocab dict finished, Total: 8 ---
2022-08-30 03:58:10,018 [DEBUG] kashgari - Top-10: ['[PAD]', 'O', 'I-ORG', 'I-LOC', 'B-LOC', 'I-PER', 'B-ORG', 'B-PER']
Calculating sequence length: 100%|██████████| 20864/20864 [00:00<00:00, 1098515.71it/s]
Calculating sequence length: 100%|██████████| 2318/2318 [00:00<00:00, 1261011.24it/s]
2022-08-30 03:58:10,075 [DEBUG] kashgari - Calculated sequence length = 97


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


2022-08-30 04:07:59,838 [DEBUG] kashgari - predict seq_length: None, input: (4636, 579)




2022-08-30 04:08:55,472 [DEBUG] kashgari - predict output: (4636, 579)
2022-08-30 04:08:55,473 [DEBUG] kashgari - predict output argmax: [[0 1 1 ... 1 1 1]
 [0 1 1 ... 1 1 1]
 [0 1 1 ... 1 1 1]
 ...
 [0 1 1 ... 1 1 1]
 [0 1 1 ... 1 1 1]
 [0 1 1 ... 1 1 1]]



           precision    recall  f1-score   support

      LOC     0.6537    0.7463    0.6970      3658
      ORG     0.5158    0.5835    0.5476      2185
      PER     0.7413    0.7918    0.7658      1864

micro avg     0.6344    0.7112    0.6706      7707
macro avg     0.6358    0.7112    0.6712      7707



{'detail': {'LOC': {'precision': 0.6537356321839081,
   'recall': 0.7463094587206124,
   'f1-score': 0.6969619606841971,
   'support': 3658},
  'ORG': {'precision': 0.5157766990291263,
   'recall': 0.5835240274599542,
   'f1-score': 0.5475628086751129,
   'support': 2185},
  'PER': {'precision': 0.7413360120542442,
   'recall': 0.7918454935622318,
   'f1-score': 0.7657587548638133,
   'support': 1864}},
 'precision': 0.6358099593066676,
 'recall': 0.7111716621253406,
 'f1-score': 0.6712450899447338,
 'support': 7707}