In [1]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the notebook later reads its helper
# modules from a path under this mount. force_remount=True asks Colab to mount
# again even when a mount is already present.
drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


# 0. Imports & Utility functions

In [2]:
from fastai.train import Learner
from fastai.train import DataBunch
from fastai.callbacks import GeneralScheduler, TrainingPhase
from fastai.basic_data import DatasetType

import numpy as np
import pandas as pd
import os
import time
import gc
import random
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
from keras.preprocessing import text, sequence
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  if sys.path[0] == '':


In [3]:
import sys

# Directory on the mounted Drive that holds the project's `moong_*` helper
# modules imported right after this. The path is absolute (Drive is mounted at
# /content/drive) so the imports do not depend on the kernel's current working
# directory.
_PROJECT_DIR = '/content/drive/MyDrive/NLP/ENG/Jigsaw2/'

# Insert only once so re-running the cell does not stack duplicate entries.
if _PROJECT_DIR not in sys.path:
    sys.path.insert(0, _PROJECT_DIR)

from moong_util import BASE_DIR, WV_DIR, MODEL_DIR, DATA_DIR, OUTPUT_DIR
from moong_util import seed_everything
from moong_embedding import build_matrix
from moong_lstm import JigsawNeuralNetV1 as NeuralNet
from moong_preprocessing import preprocess_v1 as preprocess
from moong_tokenize import tokenize_v1 as tokenize
from moong_train import train_model_v2 as train_model

In [4]:
# --- Pre-trained word-vector files, both located under WV_DIR ---
CRAWL_EMBEDDING_PATH = f'{WV_DIR}fasttext/crawl-300d-2M.vec'
GLOVE_EMBEDDING_PATH = f'{WV_DIR}glove/glove.840B.300d.txt'

# --- Data loading / batching ---
NROWS = 10000        # rows read from each of train.csv and test.csv
MAX_LEN = 220        # length argument handed to tokenize()
BATCH_SIZE = 128     # batch size for the data loaders and for train_model()

# --- Model / ensemble ---
NUM_MODELS = 2       # how many models the training loop fits
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = LSTM_UNITS * 4

# Seed the RNGs with the helper's default seed before any data work starts.
seed_everything()

# 1. Preprocessing

## 1.1 Load data & get weights

In [5]:
# Read only the first NROWS rows of each split.
train = pd.read_csv(DATA_DIR + 'train.csv', nrows=NROWS)
test = pd.read_csv(DATA_DIR + 'test.csv', nrows=NROWS)

# Preprocess the comment text; the six toxicity columns are kept as auxiliary targets.
x_train = preprocess(train['comment_text'])
y_aux_train = train[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]
x_test = preprocess(test['comment_text'])

identity_columns = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']

# Shared building blocks for the weighting terms (missing identity scores count as 0).
identity_scores = train[identity_columns].fillna(0).values
has_identity = (identity_scores >= 0.5).any(axis=1)       # at least one identity column >= 0.5
has_low_identity = (identity_scores < 0.5).any(axis=1)    # at least one identity column < 0.5
target_scores = train['target'].values

# Each sample's weight is the sum of up to four terms worth 1/4 each.

# Overall: every sample gets the base 1/4.
weights = np.full(len(x_train), 0.25)

# Subgroup: the sample has at least one identity score >= 0.5.
weights += has_identity / 4

# Background Positive, Subgroup Negative: target >= 0.5 and some identity score < 0.5.
# NOTE(review): `has_low_identity` only needs ONE of the nine columns to be below 0.5,
# which is presumably true for almost every row — confirm this is the intended condition.
weights += ((target_scores >= 0.5) & has_low_identity) / 4

# Background Negative, Subgroup Positive: target < 0.5 and some identity score >= 0.5.
weights += ((target_scores < 0.5) & has_identity) / 4

# Scale factor applied to the main-target loss in custom_loss.
loss_weight = 1.0 / weights.mean()

# Two columns per sample: binarised target, then its weight.
y_train = np.column_stack([(target_scores >= .5).astype(int), weights])

## 1.2 Tokenize

In [6]:
# Vocabulary size later passed to the model. Set an int here to override it;
# None falls back to len(word_index) + 1 below.
max_features = None

# tokenize() returns the train/test sequences plus the word -> index mapping.
# (Presumably the sequences are padded/truncated to MAX_LEN — confirm in moong_tokenize.)
x_train_pdd_sequences, x_test_pdd_sequences, word_index = tokenize(x_train, x_test, MAX_LEN)

# One extra slot beyond the vocabulary (presumably index 0 is reserved — confirm).
max_features = max_features or len(word_index) + 1
# A bare `max_features` expression in the middle of a cell is never displayed,
# so print the value explicitly.
print(f'max_features = {max_features}')

# The preprocessed texts are no longer needed once they are tokenized; free the memory.
del x_train, x_test
gc.collect()

22

## 1.3 Word embedding

In [7]:
# Build one embedding matrix per pre-trained vector file and report how many
# words build_matrix() listed as unknown for each.
crawl_matrix, unknown_words_crawl = build_matrix(word_index, CRAWL_EMBEDDING_PATH)
print('n unknown words (crawl): ', len(unknown_words_crawl))

glove_matrix, unknown_words_glove = build_matrix(word_index, GLOVE_EMBEDDING_PATH)
print('n unknown words (glove): ', len(unknown_words_glove))

# Join the two matrices along the last axis so each row carries both embeddings.
embedding_matrix = np.concatenate((crawl_matrix, glove_matrix), axis=-1)
print(f'embedding_matrix.shape = {embedding_matrix.shape}')

# Only the combined matrix is used from here on; release the two source matrices.
del crawl_matrix, glove_matrix
gc.collect()

0it [00:00, ?it/s]

n unknown words (crawl):  3823


0it [00:00, ?it/s]

n unknown words (glove):  3566
embedding_matrix.shape = (37284, 600)


42

## 1.4 Convert sequences to tensors

In [8]:
# Token-id sequences become int64 tensors (the dtype embedding lookups expect).
x_train_torch = torch.tensor(x_train_pdd_sequences, dtype=torch.long)
x_test_torch = torch.tensor(x_test_pdd_sequences, dtype=torch.long)

# Target layout per row: [binary target, sample weight, <auxiliary targets...>].
y_train_torch = torch.tensor(
    np.concatenate([y_train, y_aux_train.values], axis=1),
    dtype=torch.float32,
)

# 2. Training

In [9]:
# Wrap the tensors as datasets.
train_dataset = data.TensorDataset(x_train_torch, y_train_torch)
test_dataset = data.TensorDataset(x_test_torch)

# The "validation" set is just the first BATCH_SIZE training rows, so the
# reported valid_loss is computed on data the model also trains on.
valid_dataset = data.TensorDataset(x_train_torch[:BATCH_SIZE], y_train_torch[:BATCH_SIZE])

# Training batches are shuffled; the validation loader keeps its order.
train_loader = data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Bundle both loaders for the fastai Learner.
databunch = DataBunch(train_dl=train_loader, valid_dl=valid_loader)

In [10]:
def custom_loss(data, targets):
    """Sample-weighted BCE on the main target plus plain BCE on the auxiliary targets.

    Args:
        data: logits; column 0 is the main prediction, columns 1+ the auxiliary ones.
        targets: column 0 is the binary main label, column 1 the per-sample
            weight, columns 2+ the auxiliary labels.

    Returns:
        Scalar tensor: the weighted main loss scaled by the module-level
        ``loss_weight``, plus the auxiliary loss.
    """
    # Split predictions and targets into their main / weight / auxiliary parts.
    main_logits, aux_logits = data[:, :1], data[:, 1:]
    main_labels = targets[:, :1]
    sample_weights = targets[:, 1:2]
    aux_labels = targets[:, 2:]

    main_loss = F.binary_cross_entropy_with_logits(
        main_logits, main_labels, weight=sample_weights)
    aux_loss = F.binary_cross_entropy_with_logits(aux_logits, aux_labels)

    return main_loss * loss_weight + aux_loss

In [11]:
# Train NUM_MODELS models, each with its own seed, and collect whatever
# train_model() returns for the test set from each of them.
all_test_preds = []

# custom_loss reads column 0 of the model output as the main logit and the
# remaining columns as one logit per auxiliary target, so the output width is
# derived from the auxiliary target count instead of being hard-coded.
n_aux_targets = y_aux_train.shape[-1]
output_dim = 1 + n_aux_targets

for model_idx in range(NUM_MODELS):
    print('Model ', model_idx)
    # A different seed per model, so the runs are reproducible but not identical.
    seed_everything(1234 + model_idx)
    model = NeuralNet(LSTM_UNITS, DENSE_HIDDEN_UNITS, embedding_matrix, n_aux_targets,
                      max_features=max_features)
    learn = Learner(databunch, model, loss_func=custom_loss)
    test_preds = train_model(learn, test_dataset, output_dim=output_dim, batch_size=BATCH_SIZE)
    all_test_preds.append(test_preds)

Model  0


epoch,train_loss,valid_loss,time
0,0.460189,0.274603,00:05


epoch,train_loss,valid_loss,time
0,0.294675,0.262695,00:05


epoch,train_loss,valid_loss,time
0,0.278403,0.225216,00:05


epoch,train_loss,valid_loss,time
0,0.245456,0.221054,00:05


Model  1


epoch,train_loss,valid_loss,time
0,0.434657,0.300164,00:05


epoch,train_loss,valid_loss,time
0,0.298324,0.234371,00:05


epoch,train_loss,valid_loss,time
0,0.27951,0.248167,00:05


epoch,train_loss,valid_loss,time
0,0.256301,0.216653,00:05
