# 1. データの読み込み

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
# Make the Drive root the working directory; the CSV files are read below via relative paths.
os.chdir("/content/drive/My Drive")
# Show the directory contents as the cell output.
os.listdir()

['PQI Taggle',
 'train.csv',
 'test.csv',
 'Untitled.ipynb',
 '1601556865711.jpg',
 '1601556867726.jpg',
 '1601558074690.jpg',
 '1601992462229.jpg',
 'Kaggle Driver Detection 手法説明.pptx',
 'Kaggle Driver Detection 手法説明.pdf',
 'toxic_comment.zip',
 'Toxic Comment Classification Challenge.mm',
 'train_preprocessing_lower.csv',
 'test_preprocessing_lower.csv',
 'train_preprocessing_upper_allfeature.csv',
 'test_preprocessing_upper_allfeature.csv',
 'submission.csv',
 '20201030ラズパイ講座 完成品.zip',
 'Colab Notebooks',
 'submission_result.csv',
 'test_preprocessing.csv',
 'train_preprocessing.csv',
 'Naive Bayes.ipynb',
 'submission_result_preprocessing.csv',
 'submission_result_preprocessing_lower.csv',
 'submission_result_preprocessing_upper_allfeatires.csv',
 'submission1.csv']

In [None]:
# Load the data.
import pandas as pd
# Names of the six label columns.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
# NOTE(review): this reads 'test.csv' a second time, so df_test_labels holds the same data as
# df_test (only indexed by 'id'); presumably a separate labels file was intended — confirm.
# df_test_labels is not referenced by any of the visible cells below.
df_test_labels = pd.read_csv('test.csv')
df_test_labels = df_test_labels.set_index('id')

# Submission table, indexed by 'id'; its label columns are overwritten with predictions later.
df_submission = pd.read_csv('submission.csv', index_col='id')

df_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# 2. ライブラリの読み込み

In [None]:
# Install the transformers package, pinned to version 2.3.0.
!pip install transformers==2.3.0



In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
import numpy as np
from tqdm import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf

# 3. 学習データの作成
## 3.1 学習用データ作成の手順の確認
### 3.1.1 input_ids(テキストをエンコードしたID)

In [None]:
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences

# Name of the pretrained model to load.
#12-layer, 768-hidden, 12-heads, 110M parameters. Trained on lower-cased English text.
bert_model_name = 'bert-base-uncased'

# Build the tokenizer and set the maximum sequence length used by the demo cells.
tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True)
max_seq_len = 128

# Two sample comments that the following cells run through the tokenizer.
text='''Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? 
They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't 
remove the template from the talk page since I'm retired now.89.205.38.2'''

text2='''\nMore\nI can\'t make any real suggestions on improvement - I wondered if the section statistics 
should be later on, or a subsection of ""types of accidents""  -I think the references may need tidying so 
that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - 
if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere 
appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It\'s listed 
in the relevant form eg Wikipedia:Good_article_nominations#Transport  '''

In [None]:
# tokenize() shows the result of the tokenizer's preprocessing as a list of word pieces.
# Note the '##' prefix, which marks a piece that continues the preceding token.
print(tokenizer.tokenize(text))

['explanation', 'why', 'the', 'edit', '##s', 'made', 'under', 'my', 'user', '##name', 'hardcore', 'metallic', '##a', 'fan', 'were', 'reverted', '?', 'they', 'weren', "'", 't', 'van', '##dal', '##isms', ',', 'just', 'closure', 'on', 'some', 'gas', 'after', 'i', 'voted', 'at', 'new', 'york', 'dolls', 'fa', '##c', '.', 'and', 'please', 'don', "'", 't', 'remove', 'the', 'template', 'from', 'the', 'talk', 'page', 'since', 'i', "'", 'm', 'retired', 'now', '.', '89', '.', '205', '.', '38', '.', '2']


In [None]:
# encode() converts the text into a list of integer token IDs.
tokenized_sentence = tokenizer.encode(
  text,                      #Sentence to encode.
  add_special_tokens = True, # Add [CLS] and [SEP].
  max_length = max_seq_len,  # Maximum sequence length.
  )    
print(tokenized_sentence)

[101, 7526, 2339, 1996, 10086, 2015, 2081, 2104, 2026, 5310, 18442, 13076, 12392, 2050, 5470, 2020, 16407, 1029, 2027, 4694, 1005, 1056, 3158, 9305, 22556, 1010, 2074, 8503, 2006, 2070, 3806, 2044, 1045, 5444, 2012, 2047, 2259, 14421, 6904, 2278, 1012, 1998, 3531, 2123, 1005, 1056, 6366, 1996, 23561, 2013, 1996, 2831, 3931, 2144, 1045, 1005, 1049, 3394, 2085, 1012, 6486, 1012, 16327, 1012, 4229, 1012, 1016, 102]


In [None]:
# Check which token each ID stands for (first 10 IDs only).
# [CLS] and [SEP] have been added at the start and the end of the sequence respectively.
for input_id in tokenized_sentence[:10]:
    print('{}⇒{}'.format(input_id, tokenizer.decode([input_id])))

101⇒[CLS]
7526⇒explanation
2339⇒why
1996⇒the
10086⇒edit
2015⇒##s
2081⇒made
2104⇒under
2026⇒my
5310⇒user


In [None]:
# Encode the second sample with the same settings as the first one:
# [CLS]/[SEP] are added and the length is capped at max_seq_len.
tokenized_sentence2 = tokenizer.encode(text2, add_special_tokens=True, max_length=max_seq_len)

# Collect both encoded samples so they can be padded together.
tokenized_sentence_list = [tokenized_sentence, tokenized_sentence2]

In [None]:
# pad_sequences turns the list of ID lists into a single ndarray.
# Each row is padded with 0 (or truncated) at the end so that it has exactly max_seq_len entries.

tokenized_and_padded_sentences=pad_sequences(tokenized_sentence_list, maxlen=max_seq_len, dtype="long", value=0, truncating="post", padding="post")
print(tokenized_and_padded_sentences)

[[  101  7526  2339  1996 10086  2015  2081  2104  2026  5310 18442 13076
  12392  2050  5470  2020 16407  1029  2027  4694  1005  1056  3158  9305
  22556  1010  2074  8503  2006  2070  3806  2044  1045  5444  2012  2047
   2259 14421  6904  2278  1012  1998  3531  2123  1005  1056  6366  1996
  23561  2013  1996  2831  3931  2144  1045  1005  1049  3394  2085  1012
   6486  1012 16327  1012  4229  1012  1016   102     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [  101  2062  1045  2064  1005  1056  2191  2151  2613 15690  2006  7620
   1011  1045  4999  2065  1996  2930  6747  2323  2022  2101  2006  1010
   2030  1037  4942 29015  1997  1000  1000  4127  1997 13436

### 3.1.2 attention_mask(パディング位置)

In [None]:
# Attention mask: 1 where the token ID is greater than 0 (a real token), 0 where it is padding.
attention_masks = [
    [int(token_id > 0) for token_id in padded_row]
    for padded_row in tokenized_and_padded_sentences
]

print(np.asarray(attention_masks))

[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]


## 3.2 [3.1]の処理を全データに適用

In [None]:
# Processing the full data set is slow; uncomment the next line to try on a subset first.
#df_train=df_train.head(20000)

from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences

# Pretrained model whose vocabulary the tokenizer loads.
bert_model_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True)
# Sequence length that every comment is padded/truncated to.
MAX_LEN = 128

def tokenize_sentences(sentences, tokenizer, max_seq_len = 128):
    """Encode every sentence into a list of token IDs.

    Args:
        sentences: Iterable of texts to encode.
        tokenizer: Object whose ``encode(text, add_special_tokens=..., max_length=...)``
            method is called once per sentence.
        max_seq_len: Forwarded to ``encode`` as ``max_length``.

    Returns:
        A list holding one ``encode`` result per sentence, in input order.
        Progress is reported by wrapping the iteration in ``tqdm``.
    """
    return [
        tokenizer.encode(text, add_special_tokens=True, max_length=max_seq_len)
        for text in tqdm(sentences)
    ]

def create_attention_masks(tokenized_and_padded_sentences):
    """Build attention masks for padded token-ID sequences.

    Args:
        tokenized_and_padded_sentences: 2-D array-like of token IDs in which
            padding positions hold 0.

    Returns:
        An integer ndarray of the same shape with 1 where the token ID is
        greater than 0 and 0 elsewhere.
    """
    # One vectorised comparison replaces the per-element Python loop, which is
    # slow for the full data set (one iteration per token of every comment).
    token_ids = np.asarray(tokenized_and_padded_sentences)
    return (token_ids > 0).astype(int)

# Encode every training comment, pad/truncate each sequence to MAX_LEN with trailing zeros,
# then derive the attention masks from the padded IDs.
input_ids = tokenize_sentences(df_train['comment_text'], tokenizer, MAX_LEN)
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
attention_masks = create_attention_masks(input_ids)

100%|██████████| 159571/159571 [04:37<00:00, 574.00it/s]


## 3.3 データの分割

In [None]:
from sklearn.model_selection import train_test_split

# Collect the six label columns as a 2-D array (one row per comment).
labels =  df_train[label_cols].values

# Split into training and validation sets (10% held out for validation).
# Passing IDs, masks and labels to a single call applies one shuffle to all three,
# so the rows stay aligned by construction instead of relying on two separate
# calls that happen to share the same random_state.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, labels, random_state=0, test_size=0.1)

train_size = len(train_inputs)
validation_size = len(validation_inputs)

print(train_size)
print(validation_size)

143613
15958


## 3.4 token_type_idsの作成

In [None]:
# Segment IDs: all zeros for this fine-tuning setup.
# zeros_like keeps the integer dtype of the ID array, matching the tf.int32 model input
# (np.zeros(shape) would create float64 values instead).
token_type_ids=np.zeros_like(train_inputs)
token_type_ids

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## 3.5 input_ids, attention_masks, token_type_idsを一つのリストに格納

In [None]:
# Bundle the three model inputs in the order [input_ids, attention_masks, token_type_ids].
x_train = [train_inputs, train_masks, token_type_ids]

# Inspect the second training sample of each array.
for title, values in zip(('input_ids:', 'attention_masks:', 'token_type_ids:'), x_train):
    print(title)
    print(values[1])

input_ids:
[  101  1045  2293  5980  2015  1999  2026 10007  1012  1006  1031  1031
  5310   102     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
attention_masks:
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

# 4. Validationデータの作成

In [None]:
# Build the validation inputs the same way as the training inputs.
# zeros_like keeps the integer dtype of the ID array, matching the tf.int32 model input
# (np.zeros(shape) would create float64 values instead).
token_type_ids=np.zeros_like(validation_inputs)

# Same order as x_train: [input_ids, attention_masks, token_type_ids].
x_val = [validation_inputs, validation_masks, token_type_ids]

# Inspect the second validation sample of each array.
print('input_ids:')
print(x_val[0][1])
print('attention_masks:')
print(x_val[1][1])
print('token_type_ids:')
print(x_val[2][1])

input_ids:
[ 101 1045 8534 7065 2545  999 1045 8534 7065 2545  999 1045 8534 7065
 2545  999 1045 8534 7065 2545  999 1045 8534 7065 2545  999 1045 8534
 7065 2545  999 1045 8534 7065 2545  999 1045 8534 7065 2545  999 1045
 8534 7065 2545  999 1045 8534 7065 2545  999 1045 8534 7065 2545  999
 1045 8534 7065 2545  999 1045 8534 7065 2545  999 1045 8534 7065 2545
  999 1045 8534 7065 2545  999 1045 8534 7065 2545  999 1045 8534 7065
 2545  999 1045 8534 7065 2545  999 1045 8534 7065 2545  999 1045 8534
 7065 2545  999 1045 8534 7065 2545  999 1045 8534 7065 2545  999 1045
 8534 7065 2545  999 1045 8534 7065 2545  999 1045 8534 7065 2545  999
 1045  102]
attention_masks:
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
token_type_ids:
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

# 5. モデルの作成、学習

In [None]:
#Epoch終わりで、AUCを評価するためのクラスを定義する
from keras.callbacks import Callback
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

class RocAucEvaluation(Callback):
    """Keras callback that reports validation ROC-AUC and confusion matrices.

    At the end of every epoch whose index is divisible by ``interval`` the
    model predicts on the validation inputs, then the callback prints the
    overall ROC-AUC followed by, for each label column, a confusion matrix
    (predictions thresholded at 0.5) and that column's ROC-AUC.

    Args:
        validation_data: ``(X_val, y_val)`` pair. ``y_val`` and the model
            output are indexed as 2-D arrays with one column per label.
        interval: Evaluate only on epochs where ``epoch % interval == 0``.
    """

    # Section headers printed before each label's report; position i belongs
    # to column i of the label / prediction arrays.
    _SECTION_HEADERS = (
        "toxic Confusion matrix=====================================",
        "severe_toxic Confusion matrix==============================",
        "obscene Confusion matrix===================================",
        "threat Confusion matrix====================================",
        "insult Confusion matrix====================================",
        "identity_hate Confusion matrix=============================",
    )

    def __init__(self, validation_data=(), interval=1):
        # super(Callback, self) starts the method lookup *after* Callback and
        # therefore skips Callback.__init__; the zero-argument form runs it.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Print validation metrics; ``logs`` is accepted for the Keras API but unused."""
        # Only evaluate on epochs that are a multiple of the interval.
        if epoch % self.interval != 0:
            return

        # Predict on the validation data and report the overall AUC.
        y_pred = self.model.predict(self.X_val, verbose=0)
        score = roc_auc_score(self.y_val, y_pred)
        print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

        # Per-label report: confusion matrix at threshold 0.5, then the AUC.
        for col, header in enumerate(self._SECTION_HEADERS):
            print(header)
            print(confusion_matrix(self.y_val[:, col], y_pred[:, col] > 0.5))
            print("ROC-AUC:", roc_auc_score(self.y_val[:, col], y_pred[:, col]))

        print('')
        print('')

# Instantiate the ROC-AUC callback on the validation data; interval=1 evaluates after every epoch.
RocAuc = RocAucEvaluation(validation_data=(x_val,validation_labels), interval=1)

In [None]:
import transformers

# Every sequence was padded/truncated to MAX_LEN tokens, so reuse that constant
# for the model input shape instead of repeating the literal 128.
input_shape = (MAX_LEN, )

# One output unit per label column.
num_classes=len(label_cols)

# Build the model.
model_name='bert-base-uncased'
# The Keras placeholders carry a *_layer suffix so that they do not overwrite the
# NumPy arrays `input_ids` / `token_type_ids` created by the preprocessing cells
# (re-running those cells after this one would otherwise operate on Keras tensors).
input_ids_layer = tf.keras.layers.Input(input_shape, dtype=tf.int32)
attention_mask_layer = tf.keras.layers.Input(input_shape, dtype=tf.int32)
token_type_ids_layer = tf.keras.layers.Input(input_shape, dtype=tf.int32)
bert_model = transformers.TFBertModel.from_pretrained(model_name)

# To train only the classification head (BERT weights frozen), uncomment:
#bert_model.trainable = False

# The call returns last_hidden_state (final-layer hidden state of every token) and
# pooler_output (the final hidden state of the [CLS] token passed through a dense + tanh
# layer). pooler_output is used as the fixed-size representation of the whole comment
# and is fed to the classification layer.
last_hidden_state, pooler_output = bert_model(
    input_ids_layer,
    attention_mask=attention_mask_layer,
    token_type_ids=token_type_ids_layer
    )

# Independent sigmoid per label (multi-label classification).
output = tf.keras.layers.Dense(num_classes, activation="sigmoid")(pooler_output)

# Assemble and compile the model.
model = tf.keras.Model(inputs=[input_ids_layer, attention_mask_layer, token_type_ids_layer], outputs=[output])
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-05, clipnorm=1.0)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=["acc"])

# Show the model structure. summary() prints by itself and returns None,
# so wrapping it in print() only added a stray "None" line.
model.summary()

# Train.
model.fit(
    x_train,
    train_labels,
    batch_size=32,
    epochs=1,
    validation_data=(x_val,validation_labels),
    callbacks=[RocAuc],
)

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_bert_model_1 (TFBertModel)   ((None, 128, 768), ( 109482240   input_4[0][0]                    
                                                                 input_5[0][0]         

<tensorflow.python.keras.callbacks.History at 0x7f81180c9208>

# 6. 推論の実行

In [None]:
#推論を実施
#df_test=df_test.head(2000)
#df_submission=df_submission.head(2000)

In [None]:
# Apply the same preprocessing as for the training data: encode, pad/truncate to MAX_LEN, build masks.
test_input_ids = tokenize_sentences(df_test['comment_text'], tokenizer, MAX_LEN)
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
test_attention_masks = create_attention_masks(test_input_ids)

100%|██████████| 153164/153164 [04:15<00:00, 599.05it/s]


In [None]:
# Segment IDs for the test set: all zeros, same shape as the test inputs.
# zeros_like keeps the integer dtype of the ID array, matching the tf.int32 model input
# (np.zeros(shape) would create float64 values instead).
token_type_ids=np.zeros_like(test_input_ids)
token_type_ids

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Assemble the test inputs in the order [input_ids, attention_masks, token_type_ids].
x_test = [test_input_ids, test_attention_masks, token_type_ids]

print(x_test[0].shape)

(153164, 128)


In [None]:
# Predict label probabilities for the test set.
y_pred = model.predict(x_test, batch_size=32,verbose=1)
# Write the predictions into the submission table, reusing label_cols so the column
# list is defined in one place only.
# NOTE(review): this assigns by position, so it assumes the rows of df_submission are in
# the same order as df_test — confirm.
df_submission[label_cols] = y_pred



In [None]:
# Save the results; index=True writes the 'id' index as the first CSV column.
df_submission.to_csv('submission1.csv', index=True)
df_submission

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00001cee341fdb12,0.989021,0.349818,0.938313,0.099160,0.869715,0.610838
0000247867823ef7,0.001209,0.000051,0.000169,0.000048,0.000084,0.000052
00013b17ad220c46,0.000665,0.000064,0.000125,0.000055,0.000046,0.000075
00017563c3f7919a,0.000174,0.000065,0.000146,0.000097,0.000046,0.000088
00017695ad8997eb,0.000697,0.000052,0.000162,0.000060,0.000056,0.000040
...,...,...,...,...,...,...
fffcd0960ee309b5,0.482821,0.001680,0.160274,0.000841,0.029287,0.002017
fffd7a9a6eb32c16,0.008160,0.000078,0.000828,0.000084,0.000378,0.000111
fffda9e8d6fafa9e,0.000345,0.000052,0.000154,0.000046,0.000048,0.000049
fffe8f1340a79fc2,0.000971,0.000069,0.000158,0.000053,0.000096,0.000234
