In [1]:
import numpy as np
import pandas as pd
import os
import io
import time
import gc
import random
from tqdm.notebook import tqdm_notebook as tqdm
from keras.preprocessing import text, sequence
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F

In [2]:
# Index of the CUDA device torch is currently using, or -1 when CUDA is unavailable.
current_cuda_device = torch.cuda.current_device() if torch.cuda.is_available() else -1
print(f'Is GPU used? (0=yes, -1=no): {current_cuda_device}')

Is GPU used? (0=yes, -1=no): 0


In [3]:
# Directory that holds the pre-trained embedding files. The default is the
# original author's location; set the TOXIC_DATA_DIR environment variable to
# run the notebook on another machine without editing this cell.
DATA_DIR = os.environ.get(
    'TOXIC_DATA_DIR', '/home/konstantina/data/toxic-comments-bias-kaggle'
).rstrip('/')
CRAWL_EMBEDDING_PATH = f'{DATA_DIR}/crawl-300d-2M.vec'
GLOVE_EMBEDDING_PATH = f'{DATA_DIR}/glove.840B.300d.txt'

# Model hyper-parameters (presumably consumed by cells further down -- TODO confirm).
NUM_MODELS = 2
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
# Length every tokenized comment is padded to (passed as `maxlen` to pad_sequences).
MAX_LEN = 220

In [4]:
def preprocess(data):
    '''
    Replace punctuation and special symbols in every comment with a space.

    Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution

    Parameters
    ----------
    data : pandas.Series
        Raw comments. Every value is cast to ``str`` before cleaning, so
        non-string entries are cleaned as their string form.

    Returns
    -------
    pandas.Series
        Series in which each character listed in ``punct`` has been replaced
        by a single space; all other characters are left untouched.
    '''
    # The backslash in front of the multiplication sign is spelled as an explicit
    # escaped backslash: the bare form is an invalid escape sequence (SyntaxWarning
    # on Python 3.12+). The resulting character set is unchanged.
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'
    # One translation table lets each comment be cleaned in a single pass instead
    # of one full str.replace scan per punctuation character.
    to_space = str.maketrans(dict.fromkeys(punct, ' '))
    return data.astype(str).apply(lambda comment: comment.translate(to_space))

In [15]:
# Read the competition CSV files into DataFrames.
train = pd.read_csv('/home/konstantina/data/toxic-comments-bias-kaggle/train.csv')
test = pd.read_csv('/home/konstantina/data/toxic-comments-bias-kaggle/test.csv')
test_private = pd.read_csv('/home/konstantina/data/toxic-comments-bias-kaggle/test_private_expanded.csv')
test_public = pd.read_csv('/home/konstantina/data/toxic-comments-bias-kaggle/test_public_expanded.csv')
# Column names of the full (45-column) layout, kept for reference:
#   id, target, comment_text, severe_toxicity, obscene, identity_attack, insult,
#   threat, asian, atheist, bisexual, black, buddhist, christian, female,
#   heterosexual, hindu, homosexual_gay_or_lesbian,
#   intellectual_or_learning_disability, jewish, latino, male, muslim,
#   other_disability, other_gender, other_race_or_ethnicity, other_religion,
#   other_sexual_orientation, physical_disability, psychiatric_or_mental_illness,
#   transgender, white, created_date, publication_id, parent_id, article_id,
#   rating, funny, wow, sad, likes, disagree, sexual_explicit,
#   identity_annotator_count, toxicity_annotator_count
print('Train and test shapes: {}, {}'.format(train.shape, test.shape))
# The expanded test files carry all features and binarized toxicity.
print('Test private and test public shapes: {}, {}'.format(test_private.shape, test_public.shape))

# Clean the comment text of both splits with the same punctuation filter.
x_train, x_test = (preprocess(frame['comment_text']) for frame in (train, test))

Train and test shapes: (1804874, 45), (97320, 2)
Test private and test public shapes: (97320, 45), (97320, 45)


In [16]:
# Binary label: 1 where the `target` score is at least 0.5, otherwise 0.
y_train = np.where(train['target'].ge(0.5), 1, 0)
# Auxiliary targets: the raw `target` score together with five further label columns.
y_aux_train = train.loc[:, ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]
print('y_train: {}'.format(y_train))


y_train: [0 0 0 ... 0 1 0]


In [17]:
# Fit one tokenizer on the train and test comments together (shared vocabulary),
# then convert each split to integer sequences padded to MAX_LEN.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts([*x_train, *x_test])
x_train = sequence.pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=MAX_LEN)
x_test = sequence.pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=MAX_LEN)

In [21]:
# Number of tokenizer indices plus one; this matches the row count that
# build_matrix allocates (len(word_index) + 1).
vocabulary = len(tokenizer.word_index) + 1
print('vocabulary: {}'.format(vocabulary))

vocabulary: 327009


In [24]:
def get_coefs(word, *arr):
    """Pair a token with its embedding values.

    Parameters
    ----------
    word : str
        The token; returned unchanged.
    *arr
        The vector components (numbers or numeric strings).

    Returns
    -------
    tuple
        ``(word, vector)`` where ``vector`` is a float32 NumPy array built
        from ``arr``.
    """
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector

def load_embeddings(path):
    """Read a text embedding file into a ``{word: float32 vector}`` dict.

    Each usable line is expected to look like ``word v1 v2 ... vN`` with single
    spaces between fields.

    Parameters
    ----------
    path : str
        Location of the embedding file.

    Returns
    -------
    dict
        Maps each word to the array produced by ``get_coefs`` for its line.

    Notes
    -----
    * The file is opened as UTF-8 explicitly, so the result does not depend
      on the platform's default encoding.
    * Lines with fewer than three space-separated fields are skipped. This
      drops blank lines and the ``<count> <dim>`` header that fastText
      ``.vec`` files start with; otherwise the header would be stored as a
      word with a length-1 vector.
    """
    with open(path, encoding='utf-8') as f:
        # tqdm only wraps the line iterator to display progress.
        rows = (line.strip().split(' ') for line in tqdm(f))
        return dict(get_coefs(*fields) for fields in rows if len(fields) > 2)
def build_matrix(word_index, path, embedding_dim=300):
    """Build the embedding matrix for a tokenizer vocabulary.

    Parameters
    ----------
    word_index : dict
        Maps each word to its integer row index.
    path : str
        Embedding file to load with ``load_embeddings``.
    embedding_dim : int, optional
        Number of columns in the matrix. Defaults to 300, the value that was
        previously hard-coded.

    Returns
    -------
    tuple
        ``(embedding_matrix, unknown_words)``. ``embedding_matrix`` has shape
        ``(len(word_index) + 1, embedding_dim)``; rows for words missing from
        the embedding file stay all-zero. ``unknown_words`` lists those
        missing words in ``word_index`` iteration order.
    """
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    unknown_words = []
    for word, i in word_index.items():
        # One dict lookup instead of using KeyError for control flow.
        vector = embedding_index.get(word)
        if vector is None:
            unknown_words.append(word)
        else:
            embedding_matrix[i] = vector
    return embedding_matrix, unknown_words

In [None]:
# Embedding matrix for the tokenizer vocabulary from the crawl vectors, plus the
# vocabulary words that have no vector in that file.
crawl_matrix, unknown_words_crawl = build_matrix(
    word_index=tokenizer.word_index,
    path=CRAWL_EMBEDDING_PATH,
)
print('n unknown words (crawl): ', len(unknown_words_crawl))

0it [00:00, ?it/s]