In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# from google.colab import files
# files.upload()

# !mkdir -p ~/.kaggle//
# !cp kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json

Mounted at /content/drive


# 0. Import & Utility functions

In [2]:
import numpy as np
import pandas as pd
import os
import time
import gc
import random
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
from keras.preprocessing import text, sequence
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  import sys


In [3]:
from drive.MyDrive.NLP.ENG.Jigsaw2.moong_util import BASE_DIR, WV_DIR, MODEL_DIR, DATA_DIR, OUTPUT_DIR
from drive.MyDrive.NLP.ENG.Jigsaw2.moong_util import seed_everything

from drive.MyDrive.NLP.ENG.Jigsaw2.moong_embedding import build_matrix

from drive.MyDrive.NLP.ENG.Jigsaw2.moong_lstm import JigsawNeuralNetV1 as NeuralNet

from drive.MyDrive.NLP.ENG.Jigsaw2.moong_preprocessing import preprocess_v1 as preprocess

from drive.MyDrive.NLP.ENG.Jigsaw2.moong_tokenize import tokenize_v1 as tokenize

from drive.MyDrive.NLP.ENG.Jigsaw2.moong_train import train_model_v1 as train_model

In [4]:
CRAWL_EMBEDDING_PATH = WV_DIR + 'fasttext/crawl-300d-2M.vec'
GLOVE_EMBEDDING_PATH = WV_DIR + 'glove/glove.840B.300d.txt'
NUM_MODELS = 2
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
MAX_LEN = 220

seed_everything()

# 1. Preprocessing

In [5]:
# 1. Load data
train = pd.read_csv(DATA_DIR + 'train.csv')
test = pd.read_csv(DATA_DIR + 'test.csv')

x_train = preprocess(train['comment_text'])
y_train = np.where(train['target'] >= 0.5, 1, 0)
y_aux_train = train[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]
x_test = preprocess(test['comment_text'])

# 2. Tokenize & text to pdd sequences
max_features = None

x_train_pdd_sequences, x_test_pdd_sequences, word_index = tokenize(x_train, x_test, MAX_LEN)

max_features = max_features or len(word_index) + 1
max_features

del x_train, x_test
gc.collect()

# 3. Word embedding
crawl_matrix, unknown_words_crawl = build_matrix(word_index, CRAWL_EMBEDDING_PATH)
print('n unknown words (crawl): ', len(unknown_words_crawl))

glove_matrix, unknown_words_glove = build_matrix(word_index, GLOVE_EMBEDDING_PATH)
print('n unknown words (glove): ', len(unknown_words_glove))

embedding_matrix = np.concatenate([crawl_matrix, glove_matrix], axis=-1)
print(f'embedding_matrix.shape = {embedding_matrix.shape}')

del crawl_matrix
del glove_matrix
gc.collect()

# 4. sequence to tensor & load to cuda
x_train_torch = torch.tensor(x_train_pdd_sequences, dtype=torch.long).cuda()
x_test_torch = torch.tensor(x_test_pdd_sequences, dtype=torch.long).cuda()
y_train_torch = torch.tensor(np.hstack([y_train[:, np.newaxis], y_aux_train]),
                             dtype=torch.float32).cuda()

0it [00:00, ?it/s]

n unknown words (crawl):  173678


0it [00:00, ?it/s]

n unknown words (glove):  170383
embedding_matrix.shape = (327009, 600)


# 2. Training

In [7]:
train_dataset = data.TensorDataset(x_train_torch, y_train_torch)
test_dataset = data.TensorDataset(x_test_torch)

all_test_preds = []

for model_idx in range(NUM_MODELS):
    print('Model ', model_idx)
    seed_everything(1234 + model_idx)
    
    model = NeuralNet(LSTM_UNITS, DENSE_HIDDEN_UNITS,
                      embedding_matrix, y_aux_train.shape[-1],
                      max_features=max_features)
    model.cuda()
    
    test_preds = train_model(model, train_dataset, test_dataset, output_dim=y_train_torch.shape[-1], 
                             loss_fn=nn.BCEWithLogitsLoss(reduction='mean'))
    all_test_preds.append(test_preds)
    print()

Model  0




  0%|          | 0/3526 [00:00<?, ?it/s]

Epoch 1/4 	 loss=0.1110 	 time=649.98s


  0%|          | 0/3526 [00:00<?, ?it/s]

Epoch 2/4 	 loss=0.1041 	 time=649.45s


  0%|          | 0/3526 [00:00<?, ?it/s]

Epoch 3/4 	 loss=0.1028 	 time=651.26s


  0%|          | 0/3526 [00:00<?, ?it/s]

Epoch 4/4 	 loss=0.1020 	 time=654.23s

Model  1


  0%|          | 0/3526 [00:00<?, ?it/s]

Epoch 1/4 	 loss=0.1111 	 time=652.75s


  0%|          | 0/3526 [00:00<?, ?it/s]

Epoch 2/4 	 loss=0.1041 	 time=652.88s


  0%|          | 0/3526 [00:00<?, ?it/s]

Epoch 3/4 	 loss=0.1028 	 time=651.20s


  0%|          | 0/3526 [00:00<?, ?it/s]

Epoch 4/4 	 loss=0.1020 	 time=649.02s



In [8]:
submission = pd.DataFrame.from_dict({
    'id': test['id'],
    'prediction': np.mean(all_test_preds, axis=0)[:, 0]
})

submission.to_csv(OUTPUT_DIR + 'jigsaw2-01-lstm_base.csv', index=False)