

# 0. Import libraries



In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
warnings.filterwarnings('ignore')

import os
os.environ['OMP_NUM_THREADS'] = '8'

# 1. Load datas & set variables

In [5]:
NLP_ENG_DIR = '/content/drive/MyDrive/NLP/ENG/'
GLOVE_DIR = NLP_ENG_DIR + 'wordvector/glove/'
BASE_DIR = NLP_ENG_DIR + 'Jigsaw1/'
DATA_DIR = BASE_DIR + 'input/'
OUTPUT_DIR = BASE_DIR + 'output/'
MODEL_DIR = BASE_DIR + 'model/'

EMBEDDING_FILE = GLOVE_DIR + 'glove.42B.300d.txt'

train = pd.read_csv(DATA_DIR + 'train.csv')
test = pd.read_csv(DATA_DIR + 'test.csv')
submission = pd.read_csv(DATA_DIR + 'sample_submission.csv')

X_train = train.comment_text.fillna('fillna').values
y_train = train[["toxic", "severe_toxic", "obscene",
                 "threat", "insult", "identity_hate"]].values
X_test = test.comment_text.fillna('fillna').values

# 2. Tokenize

## 2.1 fitting on train & test texts

In [6]:
MAX_FEATURES = 30000
MAX_LEN = 100
EMBED_SIZE = 300

tokenizer = text.Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

## 2.2 Texts to sequences

In [7]:
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [18]:
x_train = sequence.pad_sequences(X_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(X_test, maxlen=MAX_LEN)

# 3. Get embedding from glove.42b.300d

In [19]:
def get_coefs(word, *arr):
  return word, np.asarray(arr, dtype='float32')

In [23]:
# {단어: 벡터}
embedding_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))

In [25]:
# 벡터들만 추출
all_embs = np.stack(embedding_index.values())

In [27]:
emb_mean, emb_std = all_embs.mean(), all_embs.std()

In [29]:
# tokenizer에 등록된 {단어:단어idx}
word_index = tokenizer.word_index
nb_words = min(MAX_FEATURES, len(word_index))
# Glove.42b.300d에 없는 단어는 정규분포 따른 랜덤값을 준다.
embedding_matrix = np.random.normal(emb_mean, emb_std,
                                    (nb_words, EMBED_SIZE))

In [31]:
for word, i in word_index.items():
  if i >= MAX_FEATURES: continue
  embedding_vector = embedding_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector

# 4. RocAucEvaluation

In [32]:
class RocAucEvaluation(Callback):
  def __init__(self, validation_data=(), interval=1):
    super(Callback, self).__init__()

    self.interval = interval
    self.X_val, self.y_val = validation_data

  def on_epoch_end(self, epoch, logs={}):
    if epoch % self.interval == 0:
      y_pred = self.model.predict(self.X_val, verbose=0)
      score = roc_auc_score(self.y_val, y_pred)
      print(f'\n ROC-AUC - epoch: {epoch+1} - score: {score:.6f}\n')

# 5. Get_model

In [33]:
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D

def get_model():
  # (100, ) -> (300, 30000)
  inp = Input(shape=(MAX_LEN, ))
  x = Embedding(MAX_FEATURES, EMBED_SIZE,
                weights=[embedding_matrix],
                trainable=True)(inp)
  x = SpatialDropout1D(.2)(x)
  x = Bidirectional(GRU(80, return_sequences=True))(x)
  avg_pool = GlobalAveragePooling1D()(x)
  max_pool = GlobalMaxPooling1D()(x)
  conc = concatenate([avg_pool, max_pool])
  outp =Dense(6, activation='sigmoid')(conc)

  model = Model(inputs=inp, outputs=outp)
  model.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
  return model

# 6. Train

In [34]:
model = get_model()

batch_size = 32
epochs = 3

X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train,
                                              train_size=.95,
                                              random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs,
                 validation_data=(X_val, y_val),
                 callbacks=[RocAuc], verbose=2)

y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred

Epoch 1/3

 ROC-AUC - epoch: 1 - score: 0.986018

4738/4738 - 75s - loss: 0.0493 - accuracy: 0.9439 - val_loss: 0.0449 - val_accuracy: 0.9652 - 75s/epoch - 16ms/step
Epoch 2/3

 ROC-AUC - epoch: 2 - score: 0.986669

4738/4738 - 62s - loss: 0.0380 - accuracy: 0.9263 - val_loss: 0.0456 - val_accuracy: 0.9909 - 62s/epoch - 13ms/step
Epoch 3/3

 ROC-AUC - epoch: 3 - score: 0.987083

4738/4738 - 61s - loss: 0.0322 - accuracy: 0.8259 - val_loss: 0.0467 - val_accuracy: 0.9287 - 61s/epoch - 13ms/step


In [35]:
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.998705,0.5292692,0.980239,0.105331,0.951809,0.2212902
1,0000247867823ef7,8.7e-05,4.84519e-07,7e-06,2.013918e-07,9e-06,5.532251e-07
2,00013b17ad220c46,0.000836,2.194141e-05,0.000398,3.800107e-05,0.000184,4.477575e-06
3,00017563c3f7919a,0.00017,2.622429e-06,4.1e-05,1.190195e-05,0.000103,1.162324e-06
4,00017695ad8997eb,0.002068,2.417166e-05,0.000129,3.748747e-05,0.000152,4.520753e-06


In [41]:
submission.to_csv(OUTPUT_DIR + 'jigsaw1-02-pooled-gru+glove-trainable-3epochs.csv', index=False)

In [42]:
!kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f /content/drive/MyDrive/NLP/ENG/Jigsaw1/output/jigsaw1-02-pooled-gru+glove-trainable-3epochs.csv -m "pooled gru + trained glove"

100% 13.8M/13.8M [00:03<00:00, 3.67MB/s]
Successfully submitted to Toxic Comment Classification Challenge

In [38]:
from google.colab import files
files.upload()

!mkdir -p ~/.kaggle//
!cp kaggle.json ~/.kaggle/
# Permission Warning이 발생하지 않도록 해줍니다.
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [40]:
submission.columns

Index(['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')