**Introduction**: With this kernel, I wished to experiment with the effects of feature engineering on a model designed to minimize bias in toxicity detection using BERT Embeddings + LSTM. 

A series of functions used in the kernel are drawn from the original BERT Embeddings + LSTM kernel by Dieter https://www.kaggle.com/christofhenkel/bert-embeddings-lstm/. They are credited to the owner wherever possible.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
import os
import gc
import re
import numpy as np 
import pandas as pd 
from tqdm import tqdm, trange
import pickle
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
# pytorch bert imports
from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert.modeling import BertModel
# keras imports
from keras.utils import np_utils
from keras.preprocessing import text, sequence
from keras.layers import CuDNNLSTM, Activation, Dense, Dropout, Input, Embedding, concatenate, Bidirectional
from keras.optimizers import Adam
from keras.models import Sequential, Model
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from keras.layers import SpatialDropout1D, Dropout, add, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.losses import binary_crossentropy
from keras import backend as K
import keras.layers as L
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


In [2]:
print(os.listdir("../input"))
# Any results you write to the current directory are saved as output.

['tunizi-icompass', 'jigsaw-unintended-bias-in-toxicity-classification', 'bert-base-uncased']


In [3]:
BERT_PRETRAINED_DIR = '../input/bert-base-uncased'
#INPUT_DIR = '../input/jigsaw-unintended-bias-in-toxicity-classification/'
BERT_VOCAB_DIR = '../input/bert-base-uncased/vocab.txt'
MAX_LENGTH = 250

In [4]:
# Getting the bert encoded training and test data
train_data = pd.read_csv('../input/tunizi-icompass/tunizi_train.txt')
test_data = train_data.sample(frac=0.3,random_state=200)
train_data = train_data.drop(test_data.index)
#test_data = pd.read_csv(INPUT_DIR + 'test.csv')

In [5]:
train_data

Unnamed: 0,ID,text,label
0,21037,alah yara7me,1
1,46442,brabi atini najah wahed amalta fi akaber korat...,1
2,45602,bravo slouma walah rajel,1
3,30855,elboutoula ma nefhem chay,1
4,19151,ma7laa zinkk,1
7,99054,sama7ha ejam3iya fi floussek kan te7eb leclub,0
8,87809,thawra 3amlouha kamchaa chbek batala dalsaltou...,-1
9,39542,slouma chere president on vous soutient,1
10,83510,brass omik t3alam elbis walahi ent5abtik w tga...,-1
11,29620,cha3b ca m3a slim,1


In [6]:
test_data

Unnamed: 0,ID,text,label
37391,48417,bravo ama bouk miboun,1
14492,14774,haya si slim hana kolna em3ak,1
2925,32197,j aime et merci fi 5ater a7la jam3iya ca,1
56041,50490,boutoula beb soui9a wel be9i maya3nina,1
62963,42989,allah yara7mo inna lilleh wa inna ilayhi raji3...,1
58887,22306,bravo si slim bon courage tous avec vous e rab...,1
67183,52209,baby boy rabi i5aless wa7lek,1
69841,94089,nsalfouk chwaya joueret,-1
66025,33319,mabrouuuuk rabi yhanikom très belle photo,1
65906,69993,ejoueur wino taw tiham 9alou fi parc l ihoud e...,-1


In [7]:
"""# Feature Engineering for the training data
regex = re.compile('[@_!#$%^&*()<>?/\|}{~:]')
train_data['capitals'] = train_data['text'].apply(lambda x: sum(1 for c in x if c.isupper()))
train_data['exclamation_points'] = train_data['comment_text'].apply(lambda x: len(regex.findall(x)))
train_data['total_length'] = train_data['comment_text'].apply(len)

# Feature Engineering for the test data
test_data['capitals'] = test_data['comment_text'].apply(lambda x: sum(1 for c in x if c.isupper()))
test_data['exclamation_points'] = test_data['comment_text'].apply(lambda x: len(regex.findall(x)))
test_data['total_length'] = test_data['comment_text'].apply(len)"""

"# Feature Engineering for the training data\nregex = re.compile('[@_!#$%^&*()<>?/\\|}{~:]')\ntrain_data['capitals'] = train_data['text'].apply(lambda x: sum(1 for c in x if c.isupper()))\ntrain_data['exclamation_points'] = train_data['comment_text'].apply(lambda x: len(regex.findall(x)))\ntrain_data['total_length'] = train_data['comment_text'].apply(len)\n\n# Feature Engineering for the test data\ntest_data['capitals'] = test_data['comment_text'].apply(lambda x: sum(1 for c in x if c.isupper()))\ntest_data['exclamation_points'] = test_data['comment_text'].apply(lambda x: len(regex.findall(x)))\ntest_data['total_length'] = test_data['comment_text'].apply(len)"

In [8]:
"""new_features = ['capitals','exclamation_points','total_length']"""
identity_columns = ['male','female','homosexual_gay_or_lesbian','christian','jewish','muslim',
                    'black','white','psychiatric_or_mental_illness']

In [9]:
"""# Customizing the weights
y_ids= (train_data[identity_columns] >= 0.5).astype(int).values
# Overall
weights = np.ones((len(train_data),)) / 4
# Subgroup
weights += (train_data[identity_columns].fillna(0).values>=0.5).sum(axis=1).astype(bool).astype(np.int) / 4
# Background Positive, Subgroup Negative
weights += (( (train_data['target'].values>=0.5).astype(bool).astype(np.int) +
   (train_data[identity_columns].fillna(0).values<0.5).sum(axis=1).astype(bool).astype(np.int) ) > 1 ).astype(bool).astype(np.int) / 4
# Background Negative, Subgroup Positive
weights += (( (train_data['target'].values<0.5).astype(bool).astype(np.int) +
   (train_data[identity_columns].fillna(0).values>=0.5).sum(axis=1).astype(bool).astype(np.int) ) > 1 ).astype(bool).astype(np.int) / 4
loss_weight = 1.0 / weights.mean()"""

y_train = np.vstack([(train_data['label'].values).astype(np.int)]).T
y_aux_train = train_data[['label']].values


In [10]:
"""#Conversion of continuous target columns to categorical
for column in identity_columns + ['target']:
    train_data[column]= np.where(train_data[column] >= 0.5, True, False)"""

"#Conversion of continuous target columns to categorical\nfor column in identity_columns + ['target']:\n    train_data[column]= np.where(train_data[column] >= 0.5, True, False)"

In [11]:
def nlp_preprocessing(text):
    filter_char = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r“”’\'∞θ÷α•à−β∅³π‘₹´°£€\×™√²—'
    text = text.lower()
    text = text.replace(filter_char,'')
    text = text.replace('[^a-zA-Z0-9 ]', '')
    return text

In [12]:
train_data['text'] = train_data['text'].apply(nlp_preprocessing)
test_data['text'] = test_data['text'].apply(nlp_preprocessing)

In [13]:
# Initialising BERT tokenizer
tokenizer = BertTokenizer(vocab_file='../input/bert-base-uncased/vocab.txt')
def tokenization(row):
    row = tokenizer.tokenize(row)
    row = tokenizer.convert_tokens_to_ids(row)
    return row

In [14]:
train_data['text'] = train_data['text'].apply(tokenization)
test_data['text'] = test_data['text'].apply(tokenization)

In [15]:
def string_ids(doc):
    doc = [str(i) for i in doc]
    return ' '.join(doc)
train_data['text'] = train_data['text'].apply(string_ids)
test_data['text'] = test_data['text'].apply(string_ids)

In [16]:
x_train = np.zeros((train_data.shape[0],MAX_LENGTH),dtype=np.int)

for i,ids in tqdm(enumerate(list(train_data['text']))):
    input_ids = [int(i) for i in ids.split()[:MAX_LENGTH]]
    inp_len = len(input_ids)
    x_train[i,:inp_len] = np.array(input_ids)
    
x_test = np.zeros((test_data.shape[0],MAX_LENGTH),dtype=np.int)

for i,ids in tqdm(enumerate(list(test_data['text']))):

    input_ids = [int(i) for i in ids.split()[:MAX_LENGTH]]
    inp_len = len(input_ids)
    x_test[i,:inp_len] = np.array(input_ids)
    
with open('temporary.pickle', mode='wb') as f:
    pickle.dump(x_test, f) # use temporary file to reduce memory

# Removing extra variables to free up the memory
del x_test
del test_data
del train_data

gc.collect()

49000it [00:00, 63278.18it/s]
21000it [00:00, 64399.19it/s]


0

In [17]:
def custom_loss_func(y_true, y_preds):
    loss = binary_crossentropy(K.reshape(y_true[:,0],(-1,1)), y_preds) * y_true[:,1]
    return loss

In [18]:
def get_bert_embed_matrix():
    bert = BertModel.from_pretrained(BERT_PRETRAINED_DIR)
    bert_embeddings = list(bert.children())[0]
    bert_word_embeddings = list(bert_embeddings.children())[0]
    mat = bert_word_embeddings.weight.data.numpy()
    return mat

In [19]:
embedding_matrix = get_bert_embed_matrix()
embedding_matrix

array([[-0.01018257, -0.06154883, -0.02649689, ..., -0.01985357,
        -0.03720997, -0.00975152],
       [-0.01170495, -0.06002603, -0.03233192, ..., -0.01681456,
        -0.04009988, -0.0106634 ],
       [-0.01975381, -0.06273633, -0.03262176, ..., -0.01650258,
        -0.04198876, -0.00323178],
       ...,
       [-0.02176224, -0.0556396 , -0.01346345, ..., -0.00432698,
        -0.0151355 , -0.02489496],
       [-0.04617237, -0.05647721, -0.00192082, ...,  0.01568751,
        -0.01387033, -0.00945213],
       [ 0.00145601, -0.08208051, -0.01597912, ..., -0.00811687,
        -0.04746607,  0.07527421]], dtype=float32)

In [20]:
def build_model(embedding_matrix, num_aux_targets):
    '''
    credits go to: https://www.kaggle.com/thousandvoices/simple-lstm/
    '''
    words = Input(shape=(MAX_LENGTH,))
    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
    x = SpatialDropout1D(0.5)(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)

    hidden = concatenate([GlobalMaxPooling1D()(x),GlobalAveragePooling1D()(x),])
    hidden = add([hidden, Dense(HIDDEN_UNITS, activation='relu')(hidden)])
    hidden = add([hidden, Dense(HIDDEN_UNITS, activation='relu')(hidden)])
    result = Dense(1, activation='sigmoid')(hidden)
    aux_result = Dense(num_aux_targets, activation='sigmoid')(hidden)
    
    model = Model(inputs=words, outputs=[result, aux_result])
    model.compile(loss=[custom_loss_func,'binary_crossentropy'],
                  optimizer=Adam(lr = 0.001))

    return model

In [21]:
tr_idx, val_idx = train_test_split(list(range(len(x_train))) ,test_size = 0.05, random_state = 42)

In [22]:
epochs = 5
LSTM_UNITS = 128
HIDDEN_UNITS = 4 * LSTM_UNITS
model_predictions = []
model_val_preds = []
weights = []

# Model Training and Prediction Phase
model = build_model(embedding_matrix, y_aux_train.shape[-1])
for epoch in range(epochs):
    model.fit(x_train[tr_idx],[y_train[tr_idx], y_aux_train[tr_idx]],
              validation_data = (x_train[val_idx],[y_train[val_idx], y_aux_train[val_idx]]),
              batch_size=512,
              epochs=1,
              verbose=1,
              callbacks=[LearningRateScheduler(lambda epoch: 1e-3 * (0.6 ** epoch))])
    with open('temporary.pickle', mode='rb') as f:
        x_test = pickle.load(f) 
    model_predictions.append(model.predict(x_test, batch_size=2048)[0].flatten())
    model_val_preds.append(model.predict(x_train[val_idx], batch_size=2048)[0].flatten())
    del x_test
    gc.collect()
    weights.append(2 ** epoch)
del model
gc.collect()

Train on 46550 samples, validate on 2450 samples
Epoch 1/1


InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument: slice index 1 of dimension 1 out of bounds.
	 [[{{node loss/dense_3_loss/strided_slice_1}}]]
	 [[loss/dense_4_loss/Mean_3/_253]]
  (1) Invalid argument: slice index 1 of dimension 1 out of bounds.
	 [[{{node loss/dense_3_loss/strided_slice_1}}]]
0 successful operations.
0 derived errors ignored.

In [23]:
val_preds = np.average(model_val_preds, weights = weights, axis = 0)

ZeroDivisionError: Weights sum to zero, can't be normalized

In [24]:
""" Following section is drawn from a set of functions used on https://www.kaggle.com/christofhenkel/bert-embeddings-lstm/ """

from sklearn.metrics import roc_auc_score

def get_s_auc(y_true,y_pred,y_identity):
    mask = y_identity==1
    try:
        s_auc = roc_auc_score(y_true[mask],y_pred[mask])
    except:
        s_auc = 1
    return s_auc

def get_bspn_auc(y_true,y_pred,y_identity):
    mask = (y_identity==1) & (y_true==1) | (y_identity==0) & (y_true==0)
    try:
        bspn_auc = roc_auc_score(y_true[mask],y_pred[mask])
    except:
        bspn_auc = 1
    return bspn_auc

def get_bpsn_auc(y_true,y_pred,y_identity):
    mask = (y_identity==1) & (y_true==0) | (y_identity==0) & (y_true==1)
    try:
        bpsn_auc = roc_auc_score(y_true[mask],y_pred[mask])
    except:
        bpsn_auc = 1
    return bpsn_auc

def get_total_auc(y_true,y_pred,y_identities):
    N = y_identities.shape[1]
    
    saucs = np.array([get_s_auc(y_true,y_pred,y_identities[:,i]) for i in range(N)])
    bpsns = np.array([get_bpsn_auc(y_true,y_pred,y_identities[:,i]) for i in range(N)])
    bspns = np.array([get_bspn_auc(y_true,y_pred,y_identities[:,i]) for i in range(N)])

    M_s_auc = np.power(np.mean(np.power(saucs, -5)),1/-5)
    M_bpsns_auc = np.power(np.mean(np.power(bpsns, -5)),1/-5)
    M_bspns_auc = np.power(np.mean(np.power(bspns, -5)),1/-5)
    r_auc = roc_auc_score(y_true,y_pred)
    
    total_auc = M_s_auc + M_bpsns_auc + M_bspns_auc + r_auc
    total_auc/= 4

    return total_auc

get_total_auc(y_train[val_idx][:,0],val_preds,y_ids[val_idx])

NameError: name 'val_preds' is not defined

#### Submission Stage:

In [25]:
# Calculate average predictions for the model
predictions = np.average(model_predictions, weights=weights, axis=0)

df_submission = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')
df_submission.drop(['comment_text'],axis = 1, inplace = True)
df_submission['prediction'] = predictions
df_submission.to_csv('submission.csv', index=False)

ZeroDivisionError: Weights sum to zero, can't be normalized

---------------------------------------------------------------------------------------------------------------------------------