In [59]:
import numpy as np
import pandas as pd
import os
import io
import time
import gc
import random
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F
import matplotlib

In [61]:
import torch

# Index of the active CUDA device, or -1 when PyTorch cannot see a GPU.
current_cuda_device = torch.cuda.current_device() if torch.cuda.is_available() else -1
print(f'Is GPU used? (0=yes, -1=no): {current_cuda_device}')

Is GPU used? (0=yes, -1=no): -1


In [None]:
# Directory that holds the pretrained embedding files. Set the JIGSAW_DATA_DIR
# environment variable to run the notebook on another machine; when it is not
# set, the original location is used.
DATA_DIR = os.environ.get(
    'JIGSAW_DATA_DIR',
    '/home/klazaridou/projects/jigsaw-unintended-bias-in-toxicity-classification',
)
CRAWL_EMBEDDING_PATH = os.path.join(DATA_DIR, 'crawl-300d-2M.vec')
GLOVE_EMBEDDING_PATH = os.path.join(DATA_DIR, 'glove.840B.300d.txt')

In [None]:
def preprocess(data):
    '''
    Replace punctuation and special symbols in a text column with spaces.

    Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution

    Args:
        data: pandas Series of comments; every value is cast to str first.

    Returns:
        A Series of the same length in which each character listed in
        `punct` has been replaced by a single space.
    '''
    # The backslash in front of the multiplication sign is now escaped
    # explicitly. It used to be written as an invalid escape sequence, which
    # Python only tolerates with a warning. The character set is unchanged.
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'
    # A single translation table cleans each comment in one pass instead of
    # issuing one str.replace call per punctuation character.
    to_space = str.maketrans(punct, ' ' * len(punct))

    return data.astype(str).apply(lambda text: text.translate(to_space))

In [None]:
# load data
# Every CSV is read from the same project directory, in the order
# train, test, private test, public test.
_jigsaw_dir = '/home/klazaridou/projects/jigsaw-unintended-bias-in-toxicity-classification'
train, test, test_private, test_public = [
    pd.read_csv(f'{_jigsaw_dir}/{csv_name}')
    for csv_name in ('train.csv', 'test.csv', 'test_private_expanded.csv', 'test_public_expanded.csv')
]
# id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,black,buddhist,christian,female,heterosexual,hindu,homosexual_gay_or_lesbian,intellectual_or_learning_disability,jewish,latino,male,muslim,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,physical_disability,psychiatric_or_mental_illness,transgender,white,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
print(f'Train and test shapes: {train.shape}, {test.shape}')
print(f'Test private and test public shapes: {test_private.shape}, {test_public.shape}')  # all features and binarized toxicity

In [None]:
# preprocess text
# Both splits go through the same cleaning routine.
x_train, x_test = (preprocess(frame['comment_text']) for frame in (train, test))

In [None]:
# get targets
# Binary label: 1 when the 'target' score is at least 0.5, otherwise 0.
is_toxic = train['target'] >= 0.5
y_train = np.where(is_toxic, 1, 0)
# Auxiliary targets are taken as-is from the listed columns.
aux_columns = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
y_aux_train = train[aux_columns]
print(f'y_train: {y_train}')
print(f'y_aux_train: {y_aux_train}')


In [None]:
# Count the class directly. Picking bin 0 out of np.histogram's ten default
# bins only gives the right number while both 0 and 1 occur in y_train.
print(f'Negative examples: {np.count_nonzero(y_train == 0)}')

In [None]:
# Count the class directly. Picking bin 9 out of np.histogram's ten default
# bins only gives the right number while both 0 and 1 occur in y_train.
print(f'Positive examples: {np.count_nonzero(y_train == 1)}')

In [None]:
# get fewer data for efficiency
# print(f'Training data example row: {train.iloc[[2]]}')
# Rows are drawn with probability proportional to their 'target' value.
# A fixed random_state keeps the subset, and everything trained on it,
# identical across kernel restarts.
SAMPLE_SEED = 1234
train_small = train.sample(n=100000, weights='target', random_state=SAMPLE_SEED)
targets = train_small['target']
print(f'Training small hist: {targets.hist(bins=2)}')
print(f'Small training data shape: {targets.shape}')

In [None]:
# preprocess text small dataset
# From here on x_train refers to the sampled subset only.
small_comments = train_small['comment_text']
x_train = preprocess(small_comments)

In [None]:
# get targets for small dataset
# Binary label: 1 when the 'target' score is at least 0.5, otherwise 0.
is_toxic_small = train_small['target'] >= 0.5
y_train = np.where(is_toxic_small, 1, 0)
# Auxiliary targets are taken as-is from the listed columns.
aux_columns = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
y_aux_train = train_small[aux_columns]
print(f'y_train: {y_train}')
print(f'y_aux_train: {y_aux_train}')

In [None]:
# Count the class directly. Picking bin 0 out of np.histogram's ten default
# bins only gives the right number while both 0 and 1 occur in y_train.
print(f'Negative examples: {np.count_nonzero(y_train == 0)}')

In [None]:
# Count the class directly. Picking bin 9 out of np.histogram's ten default
# bins only gives the right number while both 0 and 1 occur in y_train.
print(f'Positive examples: {np.count_nonzero(y_train == 1)}')

In [None]:
# tokenize and vectorize text
from keras.preprocessing import text, sequence # works with tensorflow>=2.7

# MAX_LEN is needed for padding below, but the hyper-parameter cell that sets
# it comes later in the notebook, so a fresh "Restart & Run All" stopped here
# with a NameError. Define it before use; keep it in sync with that later cell.
MAX_LEN = 50

tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(list(x_train) + list(x_test))  # fit both vocabularies
x_train = tokenizer.texts_to_sequences(x_train)  # translate into integers
x_test = tokenizer.texts_to_sequences(x_test)
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)  # pad for balanced text length
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)

In [None]:
# One entry per word known to the tokenizer, plus one extra slot
# (presumably for index 0, which padding uses -- TODO confirm).
vocabulary = len(tokenizer.word_index) + 1
print(f'words in vocabulary: {vocabulary}')

In [None]:
# build embedding matrix
from tqdm.notebook import tqdm_notebook as tqdm
def get_coefs(word, *arr):
    """Pair a token with its remaining fields converted to a float32 array.

    Args:
        word: the token (first field of an embedding line).
        *arr: the vector components, as numbers or numeric strings.

    Returns:
        Tuple ``(word, vector)`` where ``vector`` is a float32 numpy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

def load_embeddings(path):
    """Read a text embedding file into a ``{token: vector}`` dict.

    Each line is stripped, split on single spaces and handed to ``get_coefs``,
    so the first field becomes the key and the rest the float32 vector.

    Args:
        path: location of the embedding file.

    Returns:
        dict mapping each token to its numpy vector.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # which can fail or mis-decode non-ASCII tokens on some systems.
    # NOTE(review): assumes the embedding files are UTF-8 encoded -- confirm.
    with open(path, encoding='utf-8') as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in tqdm(f))
def build_matrix(word_index, path, embed_dim=300):
    """Build an embedding matrix aligned with a tokenizer's word index.

    Args:
        word_index: dict mapping each word to its integer row index.
        path: embedding file passed to ``load_embeddings``.
        embed_dim: expected vector length (columns of the matrix).

    Returns:
        Tuple ``(embedding_matrix, unknown_words)``. The matrix has
        ``len(word_index) + 1`` rows and ``embed_dim`` columns; rows of words
        without a usable vector stay all-zero and those words are listed in
        ``unknown_words``.
    """
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, embed_dim))
    unknown_words = []
    for word, i in word_index.items():
        vector = embedding_index.get(word)
        # A vector of the wrong length (for example a header line that was
        # parsed as a token) must not be assigned: a length-1 vector would be
        # silently broadcast over the whole row, other lengths would raise.
        if vector is None or vector.shape != (embed_dim,):
            unknown_words.append(word)
            continue
        embedding_matrix[i] = vector
    return embedding_matrix, unknown_words

In [None]:
# Embedding matrix built from the file at CRAWL_EMBEDDING_PATH, plus the words it lacks.
crawl_matrix, unknown_words_crawl = build_matrix(
    word_index=tokenizer.word_index,
    path=CRAWL_EMBEDDING_PATH,
)

In [None]:
# Number of tokenizer words without a crawl vector, and the matrix size.
crawl_miss_count = len(unknown_words_crawl)
print('unknown words (crawl): ', crawl_miss_count)
print(f'crawl_matrix: {crawl_matrix.shape}')

In [None]:
# Sanity check: report the embedding length for the first vocabulary entry only.
counter = 0
for word, i in tokenizer.word_index.items():
    print(f'i: {word}: embedding len: {len(crawl_matrix[i])} ')
    counter += 1
    break  # counter is 1 after the first entry, which is where the loop stops

In [None]:
# Embedding matrix built from the file at GLOVE_EMBEDDING_PATH, plus the words it lacks.
glove_matrix, unknown_words_glove = build_matrix(
    word_index=tokenizer.word_index,
    path=GLOVE_EMBEDDING_PATH,
)

In [None]:
# Number of tokenizer words without a glove vector, and the matrix size.
glove_miss_count = len(unknown_words_glove)
print('unknown words (glove): ', glove_miss_count)
print(f'glove_matrix: {glove_matrix.shape}')

In [None]:
# Sanity check: report the embedding length for the first vocabulary entry only.
counter = 0
for word, i in tokenizer.word_index.items():
    print(f'i: {word}: embedding len: {len(glove_matrix[i])} ')
    counter += 1
    break  # counter is 1 after the first entry, which is where the loop stops

In [None]:
# Join both embeddings column-wise: each row becomes [crawl vector | glove vector].
embedding_matrix = np.hstack((crawl_matrix, glove_matrix))
# TODO: pad not common words because length is 300
print(f'concatanated matrix: {embedding_matrix.shape}')
# The separate matrices are not used again; release them.
del crawl_matrix, glove_matrix
gc.collect()

In [None]:
# Sanity check: report the embedding length for the first vocabulary entry only.
counter = 0
for word, i in tokenizer.word_index.items():
    print(f'i: {word}: embedding len: {len(embedding_matrix[i])} ')
    counter += 1
    break  # counter is 1 after the first entry, which is where the loop stops


In [None]:
# Hyper-parameters shared by the input pipeline and the model.
MAX_LEN = 50                         # length every token sequence is padded to
NUM_MODELS = 1                       # number of independently seeded models to train
LSTM_UNITS = 2                       # hidden size of each LSTM direction
# Max-pool and mean-pool of a bidirectional LSTM are concatenated: 2 * (2 * LSTM_UNITS).
DENSE_HIDDEN_UNITS = LSTM_UNITS * 4

In [None]:
# define model
class SpatialDropout(nn.Dropout2d):
    """Dropout2d applied along the feature axis of an (N, T, K) tensor.

    The input is rearranged so that the K features sit on the channel axis
    Dropout2d operates on, then restored to its original layout.
    """

    def forward(self, x):
        # (N, T, K) -> (N, T, 1, K) -> (N, K, 1, T)
        channels_first = x.unsqueeze(2).permute(0, 3, 2, 1)
        # Some features are masked here.
        masked = super().forward(channels_first)
        # (N, K, 1, T) -> (N, T, 1, K) -> (N, T, K)
        return masked.permute(0, 3, 2, 1).squeeze(2)
    
class NeuralNet(nn.Module):
    """Two stacked bidirectional LSTMs on top of frozen pretrained embeddings.

    The LSTM output is max- and mean-pooled over the sequence axis, passed
    through two parallel dense layers that are added back to the pooled
    features, and projected to one main logit plus ``num_aux_targets``
    auxiliary logits.
    """

    def __init__(self, embedding_matrix, num_aux_targets):
        """
        Args:
            embedding_matrix: 2-D array of pretrained vectors, one row per
                token index; used as fixed (non-trainable) embedding weights.
            num_aux_targets: number of auxiliary outputs.
        """
        super(NeuralNet, self).__init__()
        # Take both sizes from the matrix that was passed in rather than from
        # the notebook-level `vocabulary` variable, so the layer always
        # matches its weights and the class has no hidden dependency.
        num_embeddings, embed_size = embedding_matrix.shape

        self.embedding = nn.Embedding(num_embeddings, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False  # keep pretrained vectors fixed
        self.embedding_dropout = SpatialDropout(0.3)

        self.lstm1 = nn.LSTM(embed_size, LSTM_UNITS, bidirectional=True, batch_first=True)
        # Input width is 2 * LSTM_UNITS because lstm1 is bidirectional.
        self.lstm2 = nn.LSTM(LSTM_UNITS * 2, LSTM_UNITS, bidirectional=True, batch_first=True)

        self.linear1 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        self.linear2 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)

        self.linear_out = nn.Linear(DENSE_HIDDEN_UNITS, 1)
        self.linear_aux_out = nn.Linear(DENSE_HIDDEN_UNITS, num_aux_targets)

    def forward(self, x):
        """Return raw logits of shape (batch, 1 + num_aux_targets).

        Args:
            x: integer tensor of token indices, batch first.
        """
        h_embedding = self.embedding(x)
        h_embedding = self.embedding_dropout(h_embedding)

        h_lstm1, _ = self.lstm1(h_embedding)
        h_lstm2, _ = self.lstm2(h_lstm1)

        # global average pooling
        avg_pool = torch.mean(h_lstm2, 1)
        # global max pooling
        max_pool, _ = torch.max(h_lstm2, 1)

        h_conc = torch.cat((max_pool, avg_pool), 1)
        # Both dense layers read the pooled features directly (in parallel).
        h_conc_linear1 = F.relu(self.linear1(h_conc))
        h_conc_linear2 = F.relu(self.linear2(h_conc))

        # Skip connection: pooled features plus both dense branches.
        hidden = h_conc + h_conc_linear1 + h_conc_linear2

        result = self.linear_out(hidden)
        aux_result = self.linear_aux_out(hidden)
        # Main logit first, auxiliary logits after it.
        out = torch.cat([result, aux_result], 1)

        return out

In [None]:
# transform to tensors and then datasets
x_train_torch = torch.tensor(x_train, dtype=torch.long)
x_test_torch = torch.tensor(x_test, dtype=torch.long)
# Column 0 holds the binary label; the auxiliary targets follow it.
y_stacked = np.hstack([y_train.reshape(-1, 1), y_aux_train])
y_train_torch = torch.tensor(y_stacked, dtype=torch.float32)
print(f'y_train_torch: {y_train_torch}')
# The training set pairs inputs with targets; the test set has inputs only.
train_dataset = data.TensorDataset(x_train_torch, y_train_torch)
test_dataset = data.TensorDataset(x_test_torch)

In [None]:
# training
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``.

    Computed as ``exp(-log(1 + exp(-x)))`` through ``np.logaddexp`` so that
    large negative inputs no longer overflow in ``exp``.

    Args:
        x: scalar or numpy array of logits.

    Returns:
        Values in [0, 1] with the same shape as ``x``.
    """
    return np.exp(-np.logaddexp(0, -x))

def train_model(model, train, test, loss_fn, output_dim, lr=0.001,
                batch_size=32, n_epochs=2,
                enable_checkpoint_ensemble=True):
    """Train ``model`` and return its predictions on ``test``.

    Args:
        model: network called as ``model(*inputs)``; must return logits.
        train: dataset whose items end with the target tensor.
        test: dataset of input tensors only.
        loss_fn: callable ``loss_fn(y_pred, y_batch)`` returning a scalar loss.
        output_dim: number of output columns of the model.
        lr: base learning rate for Adam.
        batch_size: batch size for both loaders.
        n_epochs: number of training epochs.
        enable_checkpoint_ensemble: when True, average the per-epoch test
            predictions with weights ``2 ** epoch``; otherwise return the
            last epoch's predictions.

    Returns:
        numpy array of shape ``(len(test), output_dim)`` with sigmoid outputs.
    """
    # lr=0.001, batch_size=512, n_epochs=4
    param_lrs = [{'params': param, 'lr': lr} for param in model.parameters()]
    optimizer = torch.optim.Adam(param_lrs, lr=lr)

    # Learning rate for epoch e is lr * 0.6 ** e.
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)

    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)
    all_test_preds = []
    checkpoint_weights = [2 ** epoch for epoch in range(n_epochs)]

    for epoch in range(n_epochs):
        start_time = time.time()

        model.train()
        avg_loss = 0.

        for batch in tqdm(train_loader, disable=False):
            x_batch = batch[:-1]  # every tensor except the last is a model input
            y_batch = batch[-1]

            y_pred = model(*x_batch)
            loss = loss_fn(y_pred, y_batch)

            optimizer.zero_grad()
            loss.backward()

            optimizer.step()
            avg_loss += loss.item() / len(train_loader)

        model.eval()
        test_preds = np.zeros((len(test), output_dim))

        # No gradients are needed for inference; skipping them avoids building
        # an autograd graph for every test batch.
        with torch.no_grad():
            for i, x_batch in enumerate(test_loader):
                y_pred = sigmoid(model(*x_batch).cpu().numpy())

                test_preds[i * batch_size:(i+1) * batch_size, :] = y_pred

        all_test_preds.append(test_preds)

        # Advance the schedule only after this epoch's optimizer updates.
        # Stepping at the top of the loop skipped the base learning rate on
        # current PyTorch versions and triggered a call-order warning.
        scheduler.step()

        elapsed_time = time.time() - start_time
        print('Epoch {}/{} \t loss={:.4f} \t time={:.2f}s'.format(
              epoch + 1, n_epochs, avg_loss, elapsed_time))

    if enable_checkpoint_ensemble:
        test_preds = np.average(all_test_preds, weights=checkpoint_weights, axis=0)
    else:
        test_preds = all_test_preds[-1]

    return test_preds

In [None]:
# Collects the test predictions of every trained model.
all_test_preds = list()

def seed_everything(seed=1234):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Also exports PYTHONHASHSEED and switches cuDNN to deterministic mode.

    Args:
        seed: integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
    
# Train NUM_MODELS networks, each with its own seed, and keep their test predictions.
for model_idx in range(NUM_MODELS):
    print('Model ', model_idx)
    seed_everything(1234 + model_idx)

    # One auxiliary output per auxiliary target column.
    num_aux_targets = y_aux_train.shape[-1]
    model = NeuralNet(embedding_matrix, num_aux_targets)
    # model.cuda()

    bce_with_logits = nn.BCEWithLogitsLoss(reduction='mean')
    test_preds = train_model(
        model,
        train_dataset,
        test_dataset,
        output_dim=y_train_torch.shape[-1],
        loss_fn=bce_with_logits,
    )
    all_test_preds.append(test_preds)
    print()