In [1]:
# Mount Google Drive inside the Colab VM at /content/drive. The project
# modules added to sys.path further down are loaded from a folder under this
# mount. force_remount=True asks Colab to mount again even when a mount is
# already present, so re-running this cell is safe.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# 0. Imports & utility functions

In [2]:
import os
import re
import gc
import sys
import time
import json
import random
import unicodedata
import multiprocessing
from functools import partial, lru_cache

!pip install emoji
import emoji

import numpy as np
import pandas as pd
import joblib
from tqdm import tqdm, tqdm_notebook

from nltk import TweetTokenizer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F
from gensim.models import KeyedVectors
from keras.preprocessing import text, sequence
from keras.preprocessing.sequence import pad_sequences

from fastai.train import Learner
from fastai.train import DataBunch
from fastai.callbacks import GeneralScheduler, TrainingPhase
from fastai.basic_data import DatasetType

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-1.7.0.tar.gz (175 kB)
[K     |████████████████████████████████| 175 kB 11.8 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.7.0-py3-none-any.whl size=171046 sha256=0461f097b1ec9552193c46d04917f8c75760b4ce46dca339fa2aa5a83ef92d7c
  Stored in directory: /root/.cache/pip/wheels/8a/4e/b6/57b01db010d17ef6ea9b40300af725ef3e210cb1acfb7ac8b6
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-1.7.0


In [3]:
import sys

# Folder on the mounted Drive that holds the moong_* project modules.
# Drive is mounted at /content/drive, so use the absolute path: a relative
# 'drive/MyDrive/...' entry only resolves while the working directory is
# /content and silently breaks the imports after any chdir.
PROJECT_DIR = '/content/drive/MyDrive/NLP/ENG/Jigsaw2/'

# Guard against stacking duplicate entries when the cell is re-run.
if PROJECT_DIR not in sys.path:
    sys.path.insert(0, PROJECT_DIR)

from moong_util import BASE_DIR, WV_DIR, MODEL_DIR, DATA_DIR, OUTPUT_DIR
from moong_util import seed_everything
from moong_embedding import build_matrix
from moong_lstm import JigsawNeuralNetV2 as NeuralNet
from moong_preprocessing import preprocess_v2 as preprocess
# from moong_tokenize import tokenize_v2 as tokenize
from moong_train import train_model_v2 as train_model
from moong_collator import SequenceBucketCollator

In [4]:
# Pretrained word-vector files; both paths are built by plain string
# concatenation onto WV_DIR (imported from moong_util), so WV_DIR must end
# with a path separator.
CRAWL_EMBEDDING_PATH = WV_DIR + 'fasttext/crawl-300d-2M.vec'
GLOVE_EMBEDDING_PATH = WV_DIR + 'glove/glove.840B.300d.txt'
NUM_MODELS = 2  # number of models trained (one per seed) in the training loop
LSTM_UNITS = 128  # first size argument passed to NeuralNet
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS  # second size argument passed to NeuralNet
NROWS = 10000  # rows read from train.csv and test.csv (pd.read_csv nrows=)
BATCH_SIZE = 128  # batch size used by the train/valid/test DataLoaders

# Called with its default arguments here; the training loop calls it again
# with an explicit per-model seed. Presumably seeds the RNGs -- see moong_util.
seed_everything()

# 1. Preprocessing

## 1.1 Load data & get weights

In [5]:
# Read only the first NROWS rows of each split.
train = pd.read_csv(DATA_DIR + 'train.csv', nrows=NROWS)
test = pd.read_csv(DATA_DIR + 'test.csv', nrows=NROWS)

# Clean the raw comment text of both splits with the project preprocessor.
x_train = preprocess(train['comment_text'])
x_test = preprocess(test['comment_text'])

# Auxiliary targets trained alongside the main binary label.
y_aux_train = train[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]

identity_columns = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']

# Identity annotations with missing values treated as 0, and the raw target.
identity_values = train[identity_columns].fillna(0).values
target_values = train['target'].values

# Row-level masks used by the weighting scheme below.
mentions_identity = (identity_values >= 0.5).any(axis=1)
# NOTE(review): this is True as soon as ANY identity column is < 0.5 (not only
# when all of them are), which is what the original expression computed.
# Confirm that this is the intended "subgroup negative" definition.
lacks_some_identity = (identity_values < 0.5).any(axis=1)
is_toxic = target_values >= 0.5
is_non_toxic = target_values < 0.5

# Every row starts at 1/4 and gains another 1/4 for each condition it meets.
weights = np.ones((len(x_train),)) / 4                               # Overall
weights += mentions_identity.astype(int) / 4                         # Subgroup
weights += (is_toxic & lacks_some_identity).astype(int) / 4          # Background Positive, Subgroup Negative
weights += (is_non_toxic & mentions_identity).astype(int) / 4        # Background Negative, Subgroup Positive

# Scale factor applied to the weighted main loss so its average weight is 1.
loss_weight = 1.0 / weights.mean()

# Column 0: binarised target, column 1: per-sample weight.
y_train = np.vstack([is_toxic.astype(int), weights]).T

## 1.2 Tokenize

In [7]:
def tokenize_v3(x_train, x_test):
    """Tokenize both splits, build a shared vocabulary and pad to a fixed width.

    Documents are split with NLTK's ``TweetTokenizer`` (handles stripped,
    repeated characters shortened). Each new token gets the next free integer
    id, starting at 1, in order of first appearance: training documents
    first, then test documents. Id 0 is never assigned to a token.

    Args:
        x_train: iterable of training documents (strings).
        x_test: iterable of test documents (strings).

    Returns:
        A 6-tuple ``(x_train_pdd_sequences, x_test_pdd_sequences, lengths,
        test_lengths, maxlen, word_dict)``:

        * the two padded id arrays produced by ``sequence.pad_sequences``,
          both ``maxlen`` columns wide;
        * ``lengths`` / ``test_lengths``: tensors holding the un-padded token
          count of every training / test document;
        * ``maxlen``: the longest training document length, returned as the
          0-dim tensor ``lengths.max()``;
        * ``word_dict``: the token -> id mapping.

    Raises:
        ValueError: if ``x_train`` yields no documents (there would be no
            length to pad to).
    """
    tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    word_dict = {}

    def _encode(docs):
        """Turn documents into lists of token ids, growing word_dict as needed."""
        encoded = []
        for doc in docs:
            # setdefault returns the existing id or registers the next one;
            # len(word_dict) + 1 keeps ids contiguous and starting at 1.
            encoded.append([
                word_dict.setdefault(token, len(word_dict) + 1)
                for token in tknzr.tokenize(doc)
            ])
        return encoded

    # Order matters: training tokens claim the lowest ids.
    train_word_sequences = _encode(x_train)
    test_word_sequences = _encode(x_test)

    if not train_word_sequences:
        raise ValueError('tokenize_v3 needs at least one training document')

    lengths = torch.from_numpy(np.array([len(seq) for seq in train_word_sequences]))
    test_lengths = torch.from_numpy(np.array([len(seq) for seq in test_word_sequences]))

    # The padded width is driven by the training split only.
    maxlen = lengths.max()
    # Give Keras a plain Python int rather than a 0-dim tensor.
    pad_width = int(maxlen)

    x_train_pdd_sequences = sequence.pad_sequences(train_word_sequences, maxlen=pad_width)
    # NOTE(review): test documents longer than maxlen are truncated by
    # pad_sequences, while test_lengths still reports their original token
    # counts. Confirm the collator tolerates lengths greater than maxlen.
    x_test_pdd_sequences = sequence.pad_sequences(test_word_sequences, maxlen=pad_width)

    return x_train_pdd_sequences, x_test_pdd_sequences, lengths, test_lengths, maxlen, word_dict

# Tokenize and pad both splits. Note that `word_index` receives the
# token -> id dictionary returned by tokenize_v3 (not a counter); it is what
# the embedding-matrix builder consumes below.
(
    x_train_pdd_sequences,
    x_test_pdd_sequences,
    lengths,
    test_lengths,
    maxlen,
    word_index,
) = tokenize_v3(x_train, x_test)

# The cleaned text is no longer needed once it has been converted to ids.
del x_train
del x_test
gc.collect()

0

## 1.3 Word embedding

In [11]:
# Build one embedding matrix per pretrained vector file; build_matrix also
# returns the vocabulary entries it could not find in that file.
crawl_matrix, unknown_words_crawl = build_matrix(word_index, CRAWL_EMBEDDING_PATH)
n_unknown_crawl = len(unknown_words_crawl)
print('n unknown words (crawl): ', n_unknown_crawl)

glove_matrix, unknown_words_glove = build_matrix(word_index, GLOVE_EMBEDDING_PATH)
n_unknown_glove = len(unknown_words_glove)
print('n unknown words (glove): ', n_unknown_glove)

# Join the two matrices along the last axis, so every row carries the
# fastText vector followed by the GloVe vector.
embedding_matrix = np.concatenate((crawl_matrix, glove_matrix), axis=-1)
print(f'embedding_matrix.shape = {embedding_matrix.shape}')

# Only the concatenated matrix is used from here on; release the parts.
del crawl_matrix, glove_matrix
gc.collect()

0it [00:00, ?it/s]

n unknown words (crawl):  5783


0it [00:00, ?it/s]

n unknown words (glove):  5807
embedding_matrix.shape = (50519, 600)


42

## 1.4 Sequence to tensor on CUDA

In [12]:
# Padded token ids become integer (long) tensors.
# NOTE(review): the section heading mentions CUDA, but nothing in this cell
# moves the tensors to a device; confirm where the transfer happens.
x_train_torch = torch.tensor(x_train_pdd_sequences, dtype=torch.long)
x_test_torch = torch.tensor(x_test_pdd_sequences, dtype=torch.long)

# Per-row target layout after stacking:
#   [binarised target, sample weight, <auxiliary target columns>]
train_targets = np.hstack([y_train, y_aux_train])
y_train_torch = torch.tensor(train_targets, dtype=torch.float32)

# 2. Training

In [13]:
batch_size = BATCH_SIZE

# Each dataset item is (token ids, un-padded length[, targets]).
train_dataset = data.TensorDataset(x_train_torch, lengths, y_train_torch)
test_dataset = data.TensorDataset(x_test_torch, test_lengths)
# Validation set is just the first two training rows -- presumably a
# placeholder so the DataBunch has a validation loader, not a real hold-out.
valid_dataset = data.Subset(train_dataset, indices=[0, 1])

# Settings shared by both collators: where the sequence and its length sit
# inside a dataset item, and the padded width.
bucket_kwargs = dict(sequence_index=0, length_index=1, maxlen=maxlen)
train_collator = SequenceBucketCollator(torch.max, label_index=2, **bucket_kwargs)
test_collator = SequenceBucketCollator(torch.max, **bucket_kwargs)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=train_collator)
valid_loader = data.DataLoader(
    valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=train_collator)
test_loader = data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False, collate_fn=test_collator)

databunch = DataBunch(
    train_dl=train_loader,
    valid_dl=valid_loader,
    collate_fn=train_collator,
)

def custom_loss(data, targets):
    """Weighted BCE on the main output plus plain BCE on the auxiliary outputs.

    Args:
        data: model logits; column 0 is the main output, the remaining
            columns are the auxiliary outputs.
        targets: per-row ``[main label, sample weight, auxiliary labels...]``.

    Returns:
        Scalar tensor: the mean sample-weighted BCE of the main output scaled
        by the module-level ``loss_weight``, plus the mean BCE of the
        auxiliary outputs.
    """
    # Split predictions and targets into their main / auxiliary parts.
    main_logits, aux_logits = data[:, :1], data[:, 1:]
    main_labels = targets[:, :1]
    sample_weights = targets[:, 1:2]
    aux_labels = targets[:, 2:]

    weighted_main = F.binary_cross_entropy_with_logits(
        main_logits, main_labels, weight=sample_weights)
    auxiliary = F.binary_cross_entropy_with_logits(aux_logits, aux_labels)

    return (weighted_main * loss_weight) + auxiliary

# Test-set predictions, one entry per trained model.
all_test_preds = []

# The loss treats output column 0 as the main logit and every further column
# as an auxiliary logit, so the model's output width is 1 + number of
# auxiliary targets. Derive it instead of hard-coding 7 so it cannot drift
# from the auxiliary column list.
n_aux_targets = y_aux_train.shape[-1]
output_dim = 1 + n_aux_targets

for model_idx in range(NUM_MODELS):
    print('Model ', model_idx)
    # A different seed per model, so the trained models differ.
    seed_everything(1234 + model_idx)
    model = NeuralNet(LSTM_UNITS, DENSE_HIDDEN_UNITS, embedding_matrix, n_aux_targets)
    learn = Learner(databunch, model, loss_func=custom_loss)
    test_preds = train_model(learn, test_dataset, output_dim=output_dim)
    all_test_preds.append(test_preds)

Model  0


epoch,train_loss,valid_loss,time
0,0.437649,0.018945,00:05


epoch,train_loss,valid_loss,time
0,0.310832,0.042102,00:05


epoch,train_loss,valid_loss,time
0,0.284462,0.056768,00:05


epoch,train_loss,valid_loss,time
0,0.245954,0.049778,00:05


Model  1


epoch,train_loss,valid_loss,time
0,0.428557,0.128577,00:05


epoch,train_loss,valid_loss,time
0,0.309178,0.039319,00:05


epoch,train_loss,valid_loss,time
0,0.285284,0.014043,00:05


epoch,train_loss,valid_loss,time
0,0.262639,0.011312,00:05
