In [1]:
# Mount Google Drive inside the Colab VM at /content/drive.
# force_remount=True re-mounts even when a mount from an earlier run exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# 0. Import & Utility functions

In [2]:
import os
import re
import gc
import sys
import time
import json
import random
import unicodedata
import multiprocessing
from functools import partial, lru_cache

!pip install emoji
import emoji

import numpy as np
import pandas as pd
import joblib
from tqdm import tqdm, tqdm_notebook

from nltk import TweetTokenizer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F
from gensim.models import KeyedVectors
from keras.preprocessing import text, sequence
from keras.preprocessing.sequence import pad_sequences

from fastai.train import Learner
from fastai.train import DataBunch
from fastai.callbacks import GeneralScheduler, TrainingPhase
from fastai.basic_data import DatasetType

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-1.7.0.tar.gz (175 kB)
[K     |████████████████████████████████| 175 kB 12.1 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.7.0-py3-none-any.whl size=171046 sha256=f02327b0b6462bf44bd462ab817278d27be067223f93ce2c48a664c034832891
  Stored in directory: /root/.cache/pip/wheels/8a/4e/b6/57b01db010d17ef6ea9b40300af725ef3e210cb1acfb7ac8b6
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-1.7.0


In [4]:
import sys

# Make the project's helper modules (moong_*) importable.
# The path is absolute so that it matches the '/content/drive' mount point
# regardless of the kernel's current working directory; the relative form
# only resolved correctly when the cwd happened to be /content.
sys.path.insert(0, '/content/drive/MyDrive/NLP/ENG/Jigsaw2/')

from moong_util import BASE_DIR, WV_DIR, MODEL_DIR, DATA_DIR, OUTPUT_DIR
from moong_util import seed_everything
from moong_embedding import gensim_to_embedding_matrix
# from moong_lstm import JigsawNeuralNetV2 as NeuralNet
from moong_preprocessing import preprocess_v2 as preprocess
from moong_tokenize import tokenize_v3 as tokenize
from moong_train import train_model_v2 as train_model
from moong_collator import SequenceBucketCollator

In [5]:
# Paths to the individual gensim word-vector files. They are disabled here:
# a precomputed embedding matrix is loaded from DATA_DIR further down instead.
# GLOVE_EMBEDDING_PATH = WV_DIR + 'gensim/glove.840B.300d.gensim'
# CRAWL_EMBEDDING_PATH = WV_DIR + 'gensim/crawl-300d-2M.gensim'
# PARA_EMBEDDING_PATH = WV_DIR + 'gensim/paragram_300_sl999.gensim'
# W2V_EMBEDDING_PATH = WV_DIR + 'gensim/GoogleNews-vectors-negative300.gensim'
# Number of models trained (each with its own seed) in the training loop.
NUM_MODELS = 2
# Hidden size per direction of each bidirectional LSTM layer.
LSTM_UNITS = 128
# NeuralNet concatenates three pooled bidirectional LSTM outputs, each of
# width 2 * LSTM_UNITS, so the dense layers operate on 6 * LSTM_UNITS features.
DENSE_HIDDEN_UNITS = 6 * LSTM_UNITS
# Passed as `nrows` to pd.read_csv for both train.csv and test.csv.
NROWS = 10000
# Batch size used by the train, validation and test DataLoaders.
BATCH_SIZE = 128

# Global seeding with seed_everything's default seed (defined in moong_util);
# the training loop re-seeds per model with 1234 + model_idx.
seed_everything()

# 1. Preprocess

## 1.1 Load data & get weights

In [6]:
# Read the first NROWS rows of each split.
train = pd.read_csv(DATA_DIR + 'train.csv', nrows=NROWS)
test = pd.read_csv(DATA_DIR + 'test.csv', nrows=NROWS)

# Clean the raw comment text (train first, then test).
x_train = preprocess(train['comment_text'])
x_test = preprocess(test['comment_text'])

# Main target plus the five auxiliary toxicity labels (six columns in total).
y_aux_train = train[[
    'target', 'severe_toxicity', 'obscene',
    'identity_attack', 'insult', 'threat',
]]

identity_columns = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']


def _identity_sample_weights(frame, columns, n_rows):
    """Return one training weight per row, built from four terms of 1/4 each.

    Every row starts at 1/4 ("overall") and gains another 1/4 for each of the
    following conditions that holds:

    * subgroup: at least one identity column is >= 0.5;
    * "background positive, subgroup negative": target >= 0.5 and at least
      one identity column is < 0.5;
    * "background negative, subgroup positive": target < 0.5 and at least
      one identity column is >= 0.5.

    Missing identity values are treated as 0.
    """
    identity = frame[columns].fillna(0).values
    any_identity_high = (identity >= 0.5).any(axis=1)
    # NOTE(review): this mask is True whenever *any* identity column is below
    # 0.5, which holds for nearly every row; confirm whether "no identity
    # mentioned" (all columns < 0.5) was the intended condition.
    any_identity_low = (identity < 0.5).any(axis=1)

    target = frame['target'].values
    is_toxic = target >= 0.5
    is_not_toxic = target < 0.5

    row_weights = np.full((n_rows,), 0.25)
    for mask in (
        any_identity_high,
        is_toxic & any_identity_low,
        is_not_toxic & any_identity_high,
    ):
        row_weights += mask.astype(int) / 4
    return row_weights


weights = _identity_sample_weights(train, identity_columns, len(x_train))

# Rescales the weighted main loss so that the average weight is 1.
loss_weight = 1.0 / weights.mean()

# Two columns per row: [binarised target, sample weight].
y_train = np.vstack([(train['target'].values >= 0.5).astype(int), weights]).T

## 1.2 Tokenize

In [7]:
# Turn the cleaned texts into padded id sequences; tokenize() also returns the
# per-example lengths, the padding length and the vocabulary index.
(
    x_train_pdd_sequences,
    x_test_pdd_sequences,
    lengths,
    test_lengths,
    maxlen,
    word_index,
) = tokenize(x_train, x_test)

# The cleaned text is no longer needed once it has been tokenized; drop it and
# ask the garbage collector to reclaim the memory.
del x_train
del x_test
gc.collect()

22

## 1.3 Word embedding

In [8]:
# Load the precomputed embedding matrix (one row per token id) from disk.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
# NOTE(review): the matrix rows must line up with the word_index returned by
# tokenize(); confirm the pickle was built from the same vocabulary, since
# NROWS limits the rows that are tokenized in this notebook.
embedding_matrix = joblib.load(DATA_DIR + 'four-embedding_matrix.pkl')

## 1.4 Sequence to tensor

In [9]:
# Padded token-id sequences as int64 tensors (index input for nn.Embedding).
x_train_torch = torch.tensor(x_train_pdd_sequences, dtype=torch.long)
x_test_torch = torch.tensor(x_test_pdd_sequences, dtype=torch.long)

# Target matrix, one row per training example:
#   column 0  -> binarised target
#   column 1  -> sample weight
#   columns 2+ -> the six columns of y_aux_train
y_train_torch = torch.tensor(
    np.hstack([y_train, y_aux_train]),
    dtype=torch.float32,
)

# 2. Training

In [10]:
class SpatialDropout(nn.Dropout2d):
    """Dropout2d applied to 3-D input so that whole feature channels are dropped.

    The input is expected to be a 3-D tensor; in this notebook it is the
    embedded batch of shape (batch, seq_len, embed_dim). Each embedding
    dimension is treated as a channel, so in training mode a dropped dimension
    is zeroed at every position of the sequence.
    """

    def forward(self, x):
        # (batch, seq, embed) -> (batch, seq, 1, embed) -> (batch, embed, 1, seq)
        channels_first = x.unsqueeze(2).transpose(1, 3)
        dropped = super().forward(channels_first)
        # Undo the layout change: back to (batch, seq, embed).
        return dropped.transpose(1, 3).squeeze(2)
    
    
class NeuralNet(nn.Module):
    """Two-layer bidirectional LSTM classifier with a main and auxiliary heads.

    Architecture: frozen pretrained embedding -> spatial dropout -> two stacked
    bidirectional LSTMs -> mean/max pooling -> two parallel dense layers added
    residually to the pooled features -> output heads.

    Args:
        lstm_units: Hidden size per direction of each LSTM layer.
        dense_hidden_units: Width of the dense layers. Must equal
            ``6 * lstm_units`` because ``forward`` concatenates three pooled
            bidirectional outputs and adds them to the dense outputs.
        embedding_matrix: Array-like of shape (vocab_size, embed_size) holding
            the pretrained embedding weights; they are copied and frozen.
        output_aux_sub: Width of the "sub" head whose logits only feed the
            main output and are not returned.
        num_aux_targets: Number of auxiliary logits returned after the main
            logit (defaults to 6, the previously hard-coded value).

    Raises:
        ValueError: If ``dense_hidden_units != 6 * lstm_units``.
    """

    def __init__(self, lstm_units, dense_hidden_units, embedding_matrix,
                 output_aux_sub=11, num_aux_targets=6):
        super().__init__()
        # Fail early with a clear message instead of an opaque shape error
        # inside forward() on the first batch.
        if dense_hidden_units != 6 * lstm_units:
            raise ValueError(
                'dense_hidden_units must equal 6 * lstm_units '
                '(got dense_hidden_units={}, lstm_units={})'.format(
                    dense_hidden_units, lstm_units))
        embed_size = embedding_matrix.shape[1]

        # The layer construction order is kept as-is so that the random
        # initialisation under a fixed seed stays the same.
        self.embedding = nn.Embedding(embedding_matrix.shape[0], embed_size)
        # Replace the random initial weights with the pretrained matrix and
        # freeze them.
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.embedding_dropout = SpatialDropout(0.3)

        self.lstm1 = nn.LSTM(embed_size, lstm_units, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(lstm_units * 2, lstm_units, bidirectional=True, batch_first=True)

        self.linear1 = nn.Linear(dense_hidden_units, dense_hidden_units)
        self.linear2 = nn.Linear(dense_hidden_units, dense_hidden_units)

        # The main head sees the hidden features plus both auxiliary heads' logits.
        self.linear_out = nn.Linear(dense_hidden_units + num_aux_targets + output_aux_sub, 1)
        self.linear_aux_out = nn.Linear(dense_hidden_units, num_aux_targets)
        self.linear_sub_out = nn.Linear(dense_hidden_units, output_aux_sub)

    def forward(self, x, lengths=None):
        """Compute the logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, shape (batch, seq_len).
            lengths: Accepted for call compatibility; ignored by this
                implementation.

        Returns:
            Tensor of shape (batch, 1 + num_aux_targets): the main logit
            followed by the auxiliary logits.
        """
        h_embedding = self.embedding(x)
        h_embedding = self.embedding_dropout(h_embedding)

        h_lstm1, _ = self.lstm1(h_embedding)
        h_lstm2, _ = self.lstm2(h_lstm1)

        # Pool over the sequence dimension; each pooled tensor is (batch, 2 * lstm_units).
        avg_pool1 = torch.mean(h_lstm1, 1)
        avg_pool2 = torch.mean(h_lstm2, 1)
        max_pool2, _ = torch.max(h_lstm2, 1)

        h_conc = torch.cat((avg_pool1, max_pool2, avg_pool2), 1)
        # Both dense layers read the pooled features directly (parallel
        # branches) and are summed with them as a residual connection.
        h_conc_linear1 = F.relu(self.linear1(h_conc))
        h_conc_linear2 = F.relu(self.linear2(h_conc))

        hidden = h_conc + h_conc_linear1 + h_conc_linear2

        aux_result = self.linear_aux_out(hidden)
        sub_result = self.linear_sub_out(hidden)
        result = self.linear_out(torch.cat((hidden, aux_result, sub_result), 1))
        # The "sub" logits only influence `result`; they are not returned.
        out = torch.cat([result, aux_result], 1)
        return out

In [11]:
batch_size = BATCH_SIZE

# Collators from moong_collator. Each dataset item is (sequence, length[, label]);
# the *_index arguments tell the collator which position holds what.
# NOTE(review): torch.max is presumably used to choose the per-batch padded
# length - confirm against SequenceBucketCollator.
train_collator = SequenceBucketCollator(
    torch.max,
    sequence_index=0,
    length_index=1,
    label_index=2,
    maxlen=maxlen,
)
test_collator = SequenceBucketCollator(
    torch.max,
    sequence_index=0,
    length_index=1,
    maxlen=maxlen,
)

# Datasets.
train_dataset = data.TensorDataset(x_train_torch, lengths, y_train_torch)
test_dataset = data.TensorDataset(x_test_torch, test_lengths)
# The "validation" set is just the first two training rows, so the reported
# valid_loss is not a held-out metric (presumably a placeholder so that a
# validation loader exists for the DataBunch).
valid_dataset = data.Subset(train_dataset, indices=[0, 1])

# Loaders: only the training loader shuffles.
train_loader = data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=train_collator,
)
valid_loader = data.DataLoader(
    valid_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=train_collator,
)
test_loader = data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=test_collator,
)

databunch = DataBunch(
    train_dl=train_loader,
    valid_dl=valid_loader,
    collate_fn=train_collator,
)

def custom_loss(data, targets):
    """Weighted BCE on the main target plus plain BCE on the auxiliary targets.

    Args:
        data: Model logits; column 0 is the main logit, the remaining columns
            are the auxiliary logits.
        targets: Column 0 is the binary main label, column 1 the per-example
            weight, and the remaining columns the auxiliary labels.

    Returns:
        Scalar tensor: the weighted main loss scaled by the module-level
        ``loss_weight`` plus the mean auxiliary loss.
    """
    main_logits, aux_logits = data[:, :1], data[:, 1:]
    main_labels = targets[:, :1]
    sample_weights = targets[:, 1:2]
    aux_labels = targets[:, 2:]

    weighted_main = F.binary_cross_entropy_with_logits(
        main_logits, main_labels, weight=sample_weights)
    aux = F.binary_cross_entropy_with_logits(aux_logits, aux_labels)
    return (weighted_main * loss_weight) + aux

all_test_preds = []

# Train NUM_MODELS models and collect each one's predictions on the test set.
for model_idx in range(NUM_MODELS):
    print('Model ', model_idx)
    # Re-seed before the model is built so that every model gets its own,
    # reproducible initialisation.
    seed_everything(1234 + model_idx)
    model = NeuralNet(
        lstm_units=LSTM_UNITS,
        dense_hidden_units=DENSE_HIDDEN_UNITS,
        embedding_matrix=embedding_matrix,
        output_aux_sub=y_aux_train.shape[-1],
    )
    learn = Learner(databunch, model, loss_func=custom_loss)
    # output_dim=7: one main logit plus the six auxiliary logits that
    # NeuralNet.forward returns.
    test_preds = train_model(learn, test_dataset, output_dim=7)
    all_test_preds.append(test_preds)

Model  0


epoch,train_loss,valid_loss,time
0,0.419193,0.066895,00:07


epoch,train_loss,valid_loss,time
0,0.314579,0.044945,00:07


epoch,train_loss,valid_loss,time
0,0.252707,0.136628,00:07


epoch,train_loss,valid_loss,time
0,0.220086,0.152,00:07


Model  1


epoch,train_loss,valid_loss,time
0,0.44709,0.15373,00:07


epoch,train_loss,valid_loss,time
0,0.296016,0.138784,00:07


epoch,train_loss,valid_loss,time
0,0.249947,0.054794,00:07


epoch,train_loss,valid_loss,time
0,0.217646,0.119369,00:07
