

# 0. Import libraries



In [None]:
import os
# Thread-count environment variables are read when the native numerical
# libraries initialise, so this is set BEFORE NumPy / scikit-learn / Keras are
# imported; setting it afterwards may have no effect.
os.environ['OMP_NUM_THREADS'] = '8'

import numpy as np
# Seeds NumPy's global RNG (used below for the random embedding rows).
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
# Silences every warning raised from this point on.
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive at /content/drive; every path used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

# 1. Load data & set variables

In [None]:
# Folder layout on the mounted Drive. Every *_DIR value ends with '/'.
NLP_ENG_DIR = '/content/drive/MyDrive/NLP/ENG/'
GLOVE_DIR = os.path.join(NLP_ENG_DIR, 'wordvector/glove/')
BASE_DIR = os.path.join(NLP_ENG_DIR, 'Jigsaw1/')
DATA_DIR = os.path.join(BASE_DIR, 'input/')
OUTPUT_DIR = os.path.join(BASE_DIR, 'output/')
MODEL_DIR = os.path.join(BASE_DIR, 'model/')

# Pretrained word vectors, one "<word> <values...>" entry per line.
EMBEDDING_FILE = os.path.join(GLOVE_DIR, 'glove.42B.300d.txt')

# Competition files: labelled comments, unlabelled comments, submission template.
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

# The six binary target columns, in the order used for y_train.
LABEL_COLUMNS = ["toxic", "severe_toxic", "obscene",
                 "threat", "insult", "identity_hate"]

# Raw comment strings; missing comments are replaced by the placeholder 'fillna'.
X_train = train['comment_text'].fillna('fillna').values
y_train = train[LABEL_COLUMNS].values
X_test = test['comment_text'].fillna('fillna').values

# 2. Tokenize

## 2.1 Fit the tokenizer on train & test texts

In [None]:
# Size settings shared by the tokenizer, the padding step and the model:
#   MAX_FEATURES - passed to Tokenizer(num_words=...) and to the Embedding layer
#   MAX_LEN      - length every sequence is padded/truncated to by pad_sequences
#   EMBED_SIZE   - number of columns in the embedding matrix
MAX_FEATURES = 30000
MAX_LEN = 100
EMBED_SIZE = 300

# Build the vocabulary from the train and test comments together.
tokenizer = text.Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

## 2.2 Texts to sequences

In [None]:
# Convert every comment into a list of word indices. The results get their own
# names instead of overwriting X_train / X_test: the raw texts stay available,
# so this cell (and the tokenizer-fitting cell) can be re-run without feeding
# already-converted sequences back into the tokenizer.
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad/truncate every sequence to exactly MAX_LEN entries.
x_train = sequence.pad_sequences(train_sequences, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(test_sequences, maxlen=MAX_LEN)

# 3. Get embeddings from glove.42B.300d

In [None]:
def get_coefs(word, *arr):
  """Split one parsed embedding line into its word and its vector.

  Args:
    word: the token (first field of the line).
    *arr: the remaining fields, each convertible to float.

  Returns:
    A `(word, vector)` tuple where `vector` is a 1-D float32 NumPy array
    holding the values of `arr` in order.
  """
  vector = np.array(arr, dtype=np.float32)
  return word, vector
# {word: vector} parsed from the GloVe text file. The context manager closes
# the file once the dict is built, and the explicit encoding avoids depending
# on the platform default.
with open(EMBEDDING_FILE, encoding='utf-8') as glove_file:
  embedding_index = dict(get_coefs(*o.strip().split()) for o in glove_file)

# Stack only the vectors to measure the overall mean / std of the pretrained
# values. np.stack needs a real sequence, so the dict view is turned into a list.
all_embs = np.stack(list(embedding_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

# {word: word index} as registered by the tokenizer.
word_index = tokenizer.word_index
# Number of embedding rows actually addressable by token ids. Keras Tokenizer
# indices start at 1 (0 is never assigned to a word), hence the +1.
nb_words = min(MAX_FEATURES, len(word_index) + 1)

# Words missing from GloVe keep a random vector drawn from N(emb_mean, emb_std).
# The matrix always has MAX_FEATURES rows so that it matches the Embedding
# layer built with input size MAX_FEATURES, even when the vocabulary is smaller.
embedding_matrix = np.random.normal(emb_mean, emb_std,
                                    (MAX_FEATURES, EMBED_SIZE))

for word, i in word_index.items():
  # Indices at or above MAX_FEATURES have no row in the matrix.
  if i >= MAX_FEATURES: continue
  embedding_vector = embedding_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector

# 4. RocAucEvaluation

In [None]:
class RocAucEvaluation(Callback):
  """Keras callback that prints the ROC-AUC on a held-out set during training.

  Args:
    validation_data: `(X_val, y_val)` pair; `X_val` is fed to
      `model.predict` and `y_val` holds the matching true labels. The default
      empty tuple cannot be unpacked, so a real pair must be supplied.
    interval: the score is computed when `epoch % interval == 0`, where
      `epoch` is the 0-based epoch number passed in by Keras.
  """

  def __init__(self, validation_data=(), interval=1):
    # super() without arguments runs Callback.__init__; the previous
    # super(Callback, self) call skipped it and went straight to its parent.
    super().__init__()

    self.interval = interval
    self.X_val, self.y_val = validation_data

  def on_epoch_end(self, epoch, logs=None):
    """Compute and print the validation ROC-AUC for the finished epoch.

    Args:
      epoch: 0-based index of the epoch that just ended.
      logs: metrics dict supplied by Keras; not used here. Defaults to None
        rather than a shared mutable dict.
    """
    if epoch % self.interval == 0:
      y_pred = self.model.predict(self.X_val, verbose=0)
      score = roc_auc_score(self.y_val, y_pred)
      print(f'\n ROC-AUC - epoch: {epoch+1} - score: {score:.6f}\n')

# 5. Get_model

In [None]:
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D

def get_model():
  """Build and compile the pooled bidirectional-GRU classifier.

  Reads the module-level MAX_LEN, MAX_FEATURES, EMBED_SIZE and
  embedding_matrix.

  Returns:
    A compiled Keras `Model` that takes MAX_LEN token ids per sample and
    outputs six sigmoid scores, one per label.
  """
  # Token ids in, one embedding vector per position out:
  # (MAX_LEN,) -> (MAX_LEN, EMBED_SIZE).
  token_ids = Input(shape=(MAX_LEN, ))
  # Initialised from the pretrained matrix; no `trainable` argument is passed.
  embedding_layer = Embedding(MAX_FEATURES, EMBED_SIZE,
                              weights=[embedding_matrix])
  embedded = embedding_layer(token_ids)
  embedded = SpatialDropout1D(.2)(embedded)

  # Keep the full output sequence so it can be pooled over time two ways.
  recurrent = Bidirectional(GRU(80, return_sequences=True))
  states = recurrent(embedded)
  pooled_mean = GlobalAveragePooling1D()(states)
  pooled_max = GlobalMaxPooling1D()(states)
  features = concatenate([pooled_mean, pooled_max])

  # One independent sigmoid per label.
  scores = Dense(6, activation='sigmoid')(features)

  classifier = Model(inputs=token_ids, outputs=scores)
  classifier.compile(loss='binary_crossentropy',
                     optimizer='adam',
                     metrics=['accuracy'])
  return classifier

# 6. Train & predict

In [None]:
# Build a new compiled model.
model = get_model()

# Training hyper-parameters passed to model.fit below.
batch_size = 32
epochs = 3

# Keep 95% of the padded training rows for fitting and the rest for validation;
# the fixed random_state makes the split repeatable.
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train,
                                              train_size=.95,
                                              random_state=233)
# Callback that prints the validation ROC-AUC after every epoch (interval=1).
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

# Train; the same held-out rows are passed to Keras as validation_data.
hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs,
                 validation_data=(X_val, y_val),
                 callbacks=[RocAuc], verbose=2)

# Score the padded test comments and write the six label columns into the
# submission frame.
y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred

Epoch 1/3

 ROC-AUC - epoch: 1 - score: 0.986735

4738/4738 - 67s - loss: 0.0493 - accuracy: 0.9609 - val_loss: 0.0458 - val_accuracy: 0.9726 - 67s/epoch - 14ms/step
Epoch 2/3

 ROC-AUC - epoch: 2 - score: 0.987782

4738/4738 - 57s - loss: 0.0377 - accuracy: 0.9129 - val_loss: 0.0444 - val_accuracy: 0.9925 - 57s/epoch - 12ms/step
Epoch 3/3

 ROC-AUC - epoch: 3 - score: 0.987210

4738/4738 - 57s - loss: 0.0320 - accuracy: 0.8419 - val_loss: 0.0464 - val_accuracy: 0.8806 - 57s/epoch - 12ms/step


In [11]:
# Show the first rows of the submission frame as a quick check of the predictions.
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.9994,0.395453,0.980261,0.02116158,0.958735,0.143918
1,0000247867823ef7,5e-05,2e-06,2e-05,7.859232e-07,2.4e-05,2e-06
2,00013b17ad220c46,0.000178,1.3e-05,0.000371,3.124946e-06,0.000129,1.6e-05
3,00017563c3f7919a,0.000281,2e-05,0.000213,9.552385e-06,0.000571,5e-06
4,00017695ad8997eb,0.00036,3.1e-05,0.000185,2.270791e-05,7.7e-05,5e-06


In [12]:
# Write the submission frame to OUTPUT_DIR as CSV, without the index column.
submission.to_csv(OUTPUT_DIR + 'jigsaw1-04-pooled-gru+glove-3epochs.csv', index=False)

In [13]:
# Upload a file from the local machine into the Colab working directory
# (the following commands expect it to be kaggle.json).
from google.colab import files
files.upload()

# Copy the uploaded file into ~/.kaggle/ (presumably where the Kaggle CLI
# looks for credentials - confirm against the CLI docs).
!mkdir -p ~/.kaggle//
!cp kaggle.json ~/.kaggle/
# Prevents the permission warning from being raised.
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [14]:
# Submit the saved CSV to the competition with the Kaggle CLI.
# NOTE(review): the message says "trainable=False", but get_model() passes no
# `trainable` argument to the Embedding layer - confirm which one is intended.
!kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f /content/drive/MyDrive/NLP/ENG/Jigsaw1/output/jigsaw1-04-pooled-gru+glove-3epochs.csv -m "pooled gru + glove, trainable=False"

100% 13.8M/13.8M [00:01<00:00, 8.01MB/s]
Successfully submitted to Toxic Comment Classification Challenge