# 0. Import libraries

In [2]:
import numpy as np
import pandas as pd
import os
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, CuDNNGRU, Conv1D
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import tensorflow as tf
print(tf.__version__)
# tf.test.is_gpu_available is deprecated; TensorFlow's own warning recommends
# tf.config.list_physical_devices('GPU'). The cell still displays a boolean.
len(tf.config.list_physical_devices('GPU')) > 0

2.8.2
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

# 1. Load data

In [3]:
# Root folders (presumably on a mounted Google Drive); the other paths are derived from them.
BASE_DIR = '/content/drive/MyDrive/NLP/ENG/Jigsaw1/'
WV_DIR = '/content/drive/MyDrive/NLP/ENG/wordvector/'
MODEL_DIR = BASE_DIR + 'model/'
DATA_DIR = BASE_DIR + 'input/'
OUTPUT_DIR = BASE_DIR + 'output/'

# Pre-trained word-vector files; one embedding matrix is built per file.
EMBEDDING_FILES = [
    WV_DIR + 'fasttext/crawl-300d-2M.vec',
    WV_DIR + 'glove/glove.840B.300d.txt',
]
# Column holding the raw comment text, and the six binary label columns.
TEXT_COLUMN = 'comment_text'
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# Characters passed to the tokenizer's `filters` argument.
# The multiplication sign used to be written as `\×`, which is not a valid escape
# sequence (Python warns about it) and only repeated the backslash that `\\`
# already contributes, so the set of filtered characters is unchanged.
CHARS_TO_REMOVE = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€×™√²—'

In [4]:
# Read the training set, the test set and the submission template, in that order.
train_df, test_df, submission = (
    pd.read_csv(DATA_DIR + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [5]:
# Comment text, coerced to str, for both splits.
x_train = train_df[TEXT_COLUMN].astype(str)
x_test = test_df[TEXT_COLUMN].astype(str)

# Label matrix: one column per entry of list_classes.
y = train_df[list_classes].values
y_train = y

# 2. Embedding

In [6]:
def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    Args:
        word: the token; returned unchanged.
        *arr: the vector components (numbers or numeric strings).

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D float32 array.
    """
    vector = np.array(arr, dtype=np.float32)
    return word, vector

In [7]:
def load_embedding(path):
  """Read a space-delimited word-vector text file into a dict.

  Each data line is ``<word> <v1> <v2> ...``. If the very first line is a
  word2vec-style header (exactly two integer fields, e.g. ``2000000 300``),
  it is skipped instead of being stored as a bogus one-element "vector".
  Blank lines are ignored. If a word occurs more than once, the last
  occurrence wins.

  Args:
    path: path to the embedding text file (read as UTF-8).

  Returns:
    dict mapping each word to a 1-D float32 NumPy array.
  """
  embedding_index = {}
  # Decode explicitly as UTF-8 rather than relying on the platform default.
  with open(path, encoding='utf-8') as f:
    for line_no, line in enumerate(f):
      stripped = line.strip()
      if not stripped:
        continue
      # Split on single spaces only, exactly as before: a bare split() would
      # also break on other whitespace characters inside a token.
      fields = stripped.split(' ')
      is_header = (
          line_no == 0
          and len(fields) == 2
          and all(field.isdigit() for field in fields)
      )
      if is_header:
        continue
      embedding_index[fields[0]] = np.asarray(fields[1:], dtype='float32')
  return embedding_index

In [8]:
def build_matrix(word_index, path, embedding_dim=300):
  """Build an embedding matrix aligned with a tokenizer's word index.

  Args:
    word_index: dict mapping each word to its integer row index.
    path: embedding text file, read with ``load_embedding``.
    embedding_dim: expected vector length (default 300, the value that was
      previously hard-coded).

  Returns:
    Array of shape ``(len(word_index) + 1, embedding_dim)``. Row ``i`` holds
    the vector of the word whose index is ``i``; rows for words missing from
    the file stay all-zero.
  """
  embedding_index = load_embedding(path)
  embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
  for word, i in word_index.items():
    vector = embedding_index.get(word)
    if vector is None:
      continue  # out-of-vocabulary word: leave the zero row
    # Only accept vectors of the expected length. A one-element entry (such as
    # a header line parsed as a word) would otherwise be broadcast across the
    # whole row without any error.
    if np.shape(vector) == (embedding_dim,):
      embedding_matrix[i] = vector
  return embedding_matrix

In [9]:
%%time
MAX_LEN = 220

# Fit one vocabulary over the train and test comments together.
tokenizer = text.Tokenizer(filters=CHARS_TO_REMOVE)
tokenizer.fit_on_texts([*x_train, *x_test])

# Convert each split to integer sequences, then pad/truncate to MAX_LEN.
x_train, x_test = (
    sequence.pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_LEN)
    for texts in (x_train, x_test)
)

CPU times: user 37.5 s, sys: 469 ms, total: 38 s
Wall time: 37.7 s


In [15]:
%%time
# One matrix per embedding file, joined column-wise so each word's row
# carries the vectors from every file side by side.
embedding_matrix = np.hstack(
    [build_matrix(tokenizer.word_index, embedding_path) for embedding_path in EMBEDDING_FILES]
)

CPU times: user 3min 22s, sys: 20.6 s, total: 3min 42s
Wall time: 3min 41s


In [16]:
# Sanity check: rows = len(tokenizer.word_index) + 1, columns = 300 per embedding file (two files -> 600).
embedding_matrix.shape

(358905, 600)

# 3. Modeling

In [20]:
# Model / training hyperparameters.
# NOTE(review): the visible training loop does not read BATCH_SIZE; it passes batch_size=128 — confirm which is intended.
BATCH_SIZE = 512
# Passed as the unit count to both the GRU and the LSTM layer in build_model.
LSTM_UNITS = 128
# NOTE(review): DENSE_HIDDEN_UNITS is not referenced anywhere in the visible notebook.
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
# NOTE(review): EPOCHS is reassigned to 5 in the training cell before it is used.
EPOCHS = 4

def build_model(embedding_matrix):
  """Build and compile the two-branch recurrent classifier.

  Architecture: frozen embedding -> spatial dropout -> two parallel branches
  (bidirectional GRU and bidirectional LSTM, each followed by a Conv1D) ->
  global average and max pooling of each branch -> concatenation -> 6 sigmoid
  outputs.

  The model is compiled here so that a freshly built model can be passed
  straight to ``fit``; the compile call had been left commented out, which
  makes ``fit`` fail on a fresh kernel.

  Layers are created in the same order as before so that weight files saved
  from earlier runs still line up when loaded.

  Args:
    embedding_matrix: 2-D array of pre-trained vectors; its shape gives the
      Embedding layer's input and output dimensions.

  Returns:
    A compiled Keras ``Model`` mapping token-id sequences to 6 probabilities.
  """
  words = Input(shape=(None, ))
  embedded = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
  embedded = SpatialDropout1D(.2)(embedded)

  # Branch 1: bidirectional GRU followed by a width-2 convolution.
  gru_branch = Bidirectional(CuDNNGRU(LSTM_UNITS, return_sequences = True))(embedded)
  gru_branch = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(gru_branch)

  # Branch 2: bidirectional LSTM followed by a width-2 convolution.
  lstm_branch = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences = True))(embedded)
  lstm_branch = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(lstm_branch)

  # Collapse the time axis of each branch with both average and max pooling.
  pooled = concatenate([
      GlobalAveragePooling1D()(gru_branch),
      GlobalMaxPooling1D()(gru_branch),
      GlobalAveragePooling1D()(lstm_branch),
      GlobalMaxPooling1D()(lstm_branch),
  ])

  # One independent sigmoid per label (multi-label output).
  outputs = Dense(6, activation='sigmoid')(pooled)

  model = Model(inputs=words, outputs=outputs)
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  return model

In [None]:
# Hold out 10% of the training rows for validation. A fixed random_state keeps
# the split identical across kernel restarts, so validation scores are comparable.
X_train, X_valid, Y_train, Y_valid = train_test_split(x_train, y_train, test_size=.1, random_state=42)

In [None]:
EPOCHS = 5
SEEDS = 10  # number of independently initialised models averaged together

# Running average of the test predictions over all models.
pred = 0

for ii in range(SEEDS):
  # Seed Python, NumPy and TensorFlow per run so each ensemble member starts
  # from a known state (GPU kernels may still add some non-determinism).
  tf.keras.utils.set_random_seed(ii)
  model = build_model(embedding_matrix)
  for global_epoch in range(EPOCHS):
    print(global_epoch)
    # One epoch per fit() call; the scheduler's own epoch argument is ignored
    # and the rate is decayed from global_epoch instead.
    model.fit(
        X_train,
        Y_train,
        validation_data=(X_valid, Y_valid),
        batch_size=128,
        epochs=1,
        verbose=2,
        callbacks=[
            LearningRateScheduler(lambda _: 1e-3*(0.55 ** global_epoch))
        ]
    )
    # Mean of the per-label ROC AUCs on the validation split.
    val_preds_3 = model.predict(X_valid, batch_size=1024)
    AUC = np.mean([
        roc_auc_score(Y_valid[:, i], val_preds_3[:, i])
        for i in range(Y_valid.shape[1])
    ])
    print(AUC)
  pred += model.predict(x_test, batch_size=1024, verbose=1)/SEEDS
  model.save_weights(MODEL_DIR + 'bilstm_gru_dual_embedding_' + str(ii) + '.h5')
  # The former os.system call built the string 'gzipmodel_weights_<ii>.h5':
  # not a valid command, and it named a file that is never written, so it
  # always failed silently. It is dropped rather than "fixed" because gzipping
  # the saved weights in place would remove the .h5 files the load cell reads.

0
1122/1122 - 151s - loss: 0.0514 - accuracy: 0.9339 - val_loss: 0.0404 - val_accuracy: 0.9761 - lr: 0.0010 - 151s/epoch - 134ms/step
0.9900521542599259
1
1122/1122 - 133s - loss: 0.0387 - accuracy: 0.9756 - val_loss: 0.0381 - val_accuracy: 0.9898 - lr: 5.5000e-04 - 133s/epoch - 118ms/step
0.9908076803704849
2
1122/1122 - 134s - loss: 0.0354 - accuracy: 0.9729 - val_loss: 0.0373 - val_accuracy: 0.9620 - lr: 3.0250e-04 - 134s/epoch - 119ms/step
0.9909609647791054
3
1122/1122 - 134s - loss: 0.0331 - accuracy: 0.9687 - val_loss: 0.0367 - val_accuracy: 0.9647 - lr: 1.6638e-04 - 134s/epoch - 120ms/step
0.9911299548682785
4
1122/1122 - 133s - loss: 0.0314 - accuracy: 0.9677 - val_loss: 0.0369 - val_accuracy: 0.9641 - lr: 9.1506e-05 - 133s/epoch - 119ms/step
0.9912073187179473
0
1122/1122 - 137s - loss: 0.0513 - accuracy: 0.9045 - val_loss: 0.0401 - val_accuracy: 0.9874 - lr: 0.0010 - 137s/epoch - 122ms/step
0.989048021050676
1
1122/1122 - 133s - loss: 0.0389 - accuracy: 0.9459 - val_loss: 0.

# 4. Load Model

In [30]:
# Rebuild the architecture once, then reuse it for every saved weight file.
model = build_model(embedding_matrix)

# Average the test predictions of the ten saved models.
pred = 0
for model_idx in range(10):
  weights_path = MODEL_DIR + f'bilstm_gru_dual_embedding_{model_idx}.h5'
  model.load_weights(weights_path)
  test_probs = model.predict(x_test, batch_size=1024, verbose=1)
  pred += test_probs/10



In [32]:
# Write the averaged probabilities into the six label columns, then display the frame.
submission[list_classes] = pred
submission

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.997470,0.403298,0.974170,0.169461,0.928706,0.481467
1,0000247867823ef7,0.000269,0.000010,0.000072,0.000004,0.000054,0.000007
2,00013b17ad220c46,0.000483,0.000089,0.000262,0.000053,0.000190,0.000073
3,00017563c3f7919a,0.000097,0.000003,0.000038,0.000018,0.000038,0.000002
4,00017695ad8997eb,0.004352,0.000095,0.000424,0.000040,0.000250,0.000020
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,0.673565,0.000582,0.278641,0.000039,0.054910,0.000608
153160,fffd7a9a6eb32c16,0.006050,0.000073,0.000364,0.000878,0.001206,0.000105
153161,fffda9e8d6fafa9e,0.000330,0.000005,0.000132,0.000002,0.000051,0.000014
153162,fffe8f1340a79fc2,0.000896,0.000017,0.000079,0.000030,0.000116,0.001346


In [33]:
# Save the submission under OUTPUT_DIR; index=False keeps the DataFrame index out of the file.
submission.to_csv(OUTPUT_DIR + "bi_lstm_gru_dual_embedding.csv", index = False)

In [36]:
# Submit the CSV written above to the competition through the Kaggle CLI.
# NOTE(review): the CLI reads ~/.kaggle/kaggle.json, which is only installed by the
# upload cell further down (In [35]); on a fresh kernel that cell has to run first.
!kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f /content/drive/MyDrive/NLP/ENG/Jigsaw1/output/bi_lstm_gru_dual_embedding.csv -m "bi_lstm_gru_dual_embedding"

100% 13.8M/13.8M [00:01<00:00, 9.49MB/s]
Successfully submitted to Toxic Comment Classification Challenge

In [35]:
# Upload kaggle.json from the local machine and install it as the Kaggle CLI credentials.
from google.colab import files
files.upload()

# Copy (not move) the uploaded file into ~/.kaggle/; the original stays in the working directory.
!mkdir -p ~/.kaggle//
!cp kaggle.json ~/.kaggle/
# Restrict the file to owner read/write so the permission warning is not raised.
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
