In [1]:
# import module we'll need to import our custom module
from shutil import copyfile

# copy our file into the working directory (make sure it has .py suffix)
copyfile(src = "../input/berttokenization/berttokenization.py", dst = "../working/berttokenization.py")

'../working/berttokenization.py'

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import tensorflow_hub as hub
import tensorflow as tf
import berttokenization as tokenization
import tensorflow.keras.backend as K
import gc
import os
from scipy.stats import spearmanr
from math import floor, ceil

np.set_printoptions(suppress=True)

#### 1. Read data and tokenizer

Read tokenizer and data, as well as defining the maximum sequence length that will be used for the input to Bert (maximum is usually 512 tokens)

In [3]:
PATH = '../input/google-quest-challenge/'
BERT_PATH = '../input/bert-base-from-tfhub/bert_en_uncased_L-12_H-768_A-12'
tokenizer = tokenization.FullTokenizer(BERT_PATH+'/assets/vocab.txt', True)
MAX_SEQUENCE_LENGTH = 512

df_train = pd.read_csv(PATH+'train.csv')
df_train_ibc = df_train[(df_train.category == "CULTURE") & ((df_train.host == "english.stackexchange.com") | (df_train.host == "ell.stackexchange.com"))]
df_test = pd.read_csv(PATH+'test.csv')
df_test_ibc = df_test[(df_test.category == "CULTURE") & ((df_test.host == "english.stackexchange.com") | (df_test.host == "ell.stackexchange.com"))]
df_sub = pd.read_csv(PATH+'sample_submission.csv')
print('train shape =', df_train.shape)
print('test shape =', df_test.shape)

output_categories_ibc = ['question_type_spelling']
output_categories_main = [x for x in list(df_train.columns[11:]) if x not in output_categories_ibc]
input_categories = list(df_train.columns[[1,2,5]])
print('\nmain output categories:\n\t', output_categories_main)
print('\noutput categories of imbalanced columns:\n\t', output_categories_ibc)
print('\ninput categories:\n\t', input_categories)

train shape = (6079, 41)
test shape = (476, 11)

main output categories:
	 ['question_asker_intent_understanding', 'question_body_critical', 'question_conversational', 'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer', 'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent', 'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice', 'question_type_compare', 'question_type_consequence', 'question_type_definition', 'question_type_entity', 'question_type_instructions', 'question_type_procedure', 'question_type_reason_explanation', 'question_well_written', 'answer_helpful', 'answer_level_of_information', 'answer_plausible', 'answer_relevance', 'answer_satisfaction', 'answer_type_instructions', 'answer_type_procedure', 'answer_type_reason_explanation', 'answer_well_written']

output categories of imbalanced columns:
	 ['question_type_spelling']

input categories:
	 ['question_title'

#### 2. Preprocessing functions

These are some functions that will be used to preprocess the raw text data into useable Bert inputs.

In [4]:
def _get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def _get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            if first_sep:
                first_sep = False 
            else:
                current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def _get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

def _trim_input(title, question, answer, max_sequence_length, 
                t_max_len=30, q_max_len=239, a_max_len=239):

    t = tokenizer.tokenize(title)
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)
    
    t_len = len(t)
    q_len = len(q)
    a_len = len(a)

    if (t_len+q_len+a_len+4) > max_sequence_length:
        
        if t_max_len > t_len:
            t_new_len = t_len
            a_max_len = a_max_len + floor((t_max_len - t_len)/2)
            q_max_len = q_max_len + ceil((t_max_len - t_len)/2)
        else:
            t_new_len = t_max_len
      
        if a_max_len > a_len:
            a_new_len = a_len 
            q_new_len = q_max_len + (a_max_len - a_len)
        elif q_max_len > q_len:
            a_new_len = a_max_len + (q_max_len - q_len)
            q_new_len = q_len
        else:
            a_new_len = a_max_len
            q_new_len = q_max_len
            
            
        if t_new_len+a_new_len+q_new_len+4 != max_sequence_length:
            raise ValueError("New sequence length should be %d, but is %d" 
                             % (max_sequence_length, (t_new_len+a_new_len+q_new_len+4)))
        
        t = t[:t_new_len]
        q = q[:q_new_len]
        a = a[:a_new_len]
    
    return t, q, a

def _convert_to_bert_inputs(title, question, answer, tokenizer, max_sequence_length):
    """Converts tokenized input to ids, masks and segments for BERT"""
    
    stoken = ["[CLS]"] + title + ["[SEP]"] + question + ["[SEP]"] + answer + ["[SEP]"]

    input_ids = _get_ids(stoken, tokenizer, max_sequence_length)
    input_masks = _get_masks(stoken, max_sequence_length)
    input_segments = _get_segments(stoken, max_sequence_length)

    return [input_ids, input_masks, input_segments]

def compute_input_arrays(df, columns, tokenizer, max_sequence_length):
    input_ids, input_masks, input_segments = [], [], []
    for _, instance in tqdm(df[columns].iterrows()):
        t, q, a = instance.question_title, instance.question_body, instance.answer

        t, q, a = _trim_input(t, q, a, max_sequence_length)

        ids, masks, segments = _convert_to_bert_inputs(t, q, a, tokenizer, max_sequence_length)
        input_ids.append(ids)
        input_masks.append(masks)
        input_segments.append(segments)
        
    return [np.asarray(input_ids, dtype=np.int32), 
            np.asarray(input_masks, dtype=np.int32), 
            np.asarray(input_segments, dtype=np.int32)]


def compute_output_arrays(df, columns):
    return np.asarray(df[columns])

#### 3. Create model

`compute_spearmanr()` is used to compute the competition metric for the validation set
<br><br>
`CustomCallback()` is a class which inherits from `tf.keras.callbacks.Callback` and will compute and append validation score and validation/test predictions respectively, after each epoch.
<br><br>
`bert_model()` contains the actual architecture that will be used to finetune BERT to our dataset. It's simple, just taking the sequence_output of the bert_layer and pass it to an AveragePooling layer and finally to an output layer of 30 units (30 classes that we have to predict)
<br><br>
`train_and_predict()` this function will be run to train and obtain predictions

In [5]:
def compute_spearmanr(trues, preds):
    rhos = []
    for col_trues, col_pred in zip(trues.T, preds.T):
        rhos.append(
            spearmanr(col_trues, col_pred + np.random.normal(0, 1e-7, col_pred.shape[0])).correlation)
    return np.mean(rhos)


class CustomCallback(tf.keras.callbacks.Callback):
    
    def __init__(self, valid_data, test_data, batch_size=16, fold=None):

        self.valid_inputs = valid_data[0]
        self.valid_outputs = valid_data[1]
        self.test_inputs = test_data
        
        self.batch_size = batch_size
        self.fold = fold
        
    def on_train_begin(self, logs={}):
        self.valid_predictions = []
        self.test_predictions = []
        
    def on_epoch_end(self, epoch, logs={}):
        self.valid_predictions.append(
            self.model.predict(self.valid_inputs, batch_size=self.batch_size))
        
        rho_val = compute_spearmanr(
            self.valid_outputs, np.average(self.valid_predictions, axis=0))
        
        print("\nvalidation rho: %.4f" % rho_val)
        
        if self.fold is not None:
            self.model.save_weights(f'bert-base-{fold}-{epoch}.h5py')
        
        self.test_predictions.append(
            self.model.predict(self.test_inputs, batch_size=self.batch_size)
        )

def bert_model(out_num):
    
    input_word_ids = tf.keras.layers.Input(
        (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_word_ids')
    input_masks = tf.keras.layers.Input(
        (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_masks')
    input_segments = tf.keras.layers.Input(
        (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_segments')
    
    bert_layer = hub.KerasLayer(BERT_PATH, trainable=True)
    
    _, sequence_output = bert_layer([input_word_ids, input_masks, input_segments])
    
    x = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
    x = tf.keras.layers.Dropout(0.2)(x)
    out = tf.keras.layers.Dense(out_num, activation="sigmoid", name="dense_output")(x)

    model = tf.keras.models.Model(
        inputs=[input_word_ids, input_masks, input_segments], outputs=out)
    
    return model    
        
def train_and_predict(model, train_data, valid_data, test_data, 
                      learning_rate, epochs, batch_size, loss_function, fold):
        
    custom_callback = CustomCallback(
        valid_data=(valid_data[0], valid_data[1]), 
        test_data=test_data,
        batch_size=batch_size,
        fold=None)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss=loss_function, optimizer=optimizer)
    model.fit(train_data[0], train_data[1], epochs=epochs, 
              batch_size=batch_size, callbacks=[custom_callback])
    
    return custom_callback


#### 4. Obtain inputs and targets, as well as the indices of the train/validation splits

In [6]:
gkf = GroupKFold(n_splits=10).split(X=df_train.question_body, groups=df_train.question_body) ############## originaln_splits=5

outputs_main = compute_output_arrays(df_train, output_categories_main)
outputs_ibc = compute_output_arrays(df_train_ibc, output_categories_ibc)
inputs_main = compute_input_arrays(df_train, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
inputs_ibc = compute_input_arrays(df_train_ibc, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
test_inputs_main = compute_input_arrays(df_test, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
test_inputs_ibc = compute_input_arrays(df_test_ibc, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




#### 5. Training, validation and testing

Loops over the folds in gkf and trains each fold for 5 epochs --- with a learning rate of 1e-5 and batch_size of 8. A simple binary crossentropy is used as the objective-/loss-function. 


histories = []
for fold, (train_idx, valid_idx) in enumerate(gkf):
    
    # will actually only do 3 folds (out of 5) to manage < 2h
    if fold >= 3:
        K.clear_session()
        model = bert_model(29)

        train_inputs = [inputs_main[i][train_idx] for i in range(3)]
        train_outputs = outputs_main[train_idx]

        valid_inputs = [inputs_main[i][valid_idx] for i in range(3)]
        valid_outputs = outputs_main[valid_idx]

        # history contains two lists of valid and test preds respectively:
        #  [valid_predictions_{fold}, test_predictions_{fold}]
        history = train_and_predict(model, 
                          train_data=(train_inputs, train_outputs), 
                          valid_data=(valid_inputs, valid_outputs),
                          test_data=test_inputs_main, 
                          learning_rate=3e-5, epochs=5, batch_size=8,
                          loss_function='binary_crossentropy', fold=fold)

        histories.append(history)
        model.save_weights(f'bert-main-{fold}.h5')


In [7]:
import os
print(os.listdir("../input/bert-main/"))

['bert-main-7.h5', 'bert-main-1.h5', 'bert-main-8.h5', 'bert-main-5.h5', 'bert-main-6.h5', 'bert-main-4.h5', 'bert-main-2.h5', 'bert-main-3.h5', 'bert-main-0.h5']


In [8]:
# Load trained models
model_path = [f'../input/bert-main/bert-main-0.h5', f'../input/bert-main/bert-main-1.h5', f'../input/bert-main/bert-main-2.h5', f'../input/bert-main/bert-main-3.h5', f'../input/bert-main/bert-main-4.h5', f'../input/bert-main/bert-main-5.h5', f'../input/bert-main/bert-main-6.h5', f'../input/bert-main/bert-main-7.h5', f'../input/bert-main/bert-main-8.h5']
models = []

for i in range(len(model_path)):
    mp = model_path[i]
    model = bert_model(29)
    model.load_weights(mp)
    models.append(model)

In [9]:
# Predict the labels of seperately train set
test_pred_main = []

for model in models:
    test_pred_main.append(model.predict(test_inputs_main, batch_size=8, verbose=1))

final_predictions_main = np.mean(test_pred_main, axis=0)
df_test_pred_main = pd.DataFrame(data=final_predictions_main, columns=output_categories_main)
df_test_pred_main



Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.929999,0.625284,0.210023,0.444978,0.523696,0.531140,0.675310,0.664760,0.616019,0.004657,...,0.892150,0.900712,0.576056,0.961887,0.964484,0.818321,0.021259,0.018069,0.945276,0.906907
1,0.880081,0.503621,0.005112,0.873317,0.751443,0.964624,0.566802,0.431049,0.040762,0.004229,...,0.658749,0.956269,0.657250,0.976231,0.985475,0.881382,0.942901,0.105102,0.036010,0.898183
2,0.900544,0.661850,0.012635,0.843893,0.855036,0.971658,0.574571,0.461402,0.149685,0.004284,...,0.833605,0.933825,0.584210,0.967829,0.969317,0.806190,0.084293,0.073981,0.942322,0.911592
3,0.884594,0.425525,0.005111,0.693434,0.787898,0.904945,0.564193,0.435626,0.120536,0.010162,...,0.724172,0.961425,0.669053,0.980672,0.986820,0.912806,0.849769,0.113941,0.774162,0.910523
4,0.933189,0.372432,0.045524,0.884907,0.725809,0.891901,0.659618,0.614477,0.055709,0.015054,...,0.638898,0.923703,0.641121,0.962308,0.966669,0.824205,0.180326,0.135201,0.550485,0.864580
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.877715,0.544082,0.006741,0.733175,0.872214,0.958463,0.562008,0.488835,0.047584,0.004307,...,0.705905,0.967019,0.718791,0.975117,0.992918,0.935778,0.955097,0.098312,0.023019,0.902882
472,0.841921,0.508345,0.013620,0.741058,0.796748,0.832091,0.530476,0.402255,0.035211,0.006551,...,0.619870,0.922328,0.663867,0.943712,0.959998,0.854798,0.837499,0.152317,0.123884,0.893590
473,0.762759,0.361513,0.012870,0.621391,0.812363,0.835770,0.506056,0.401189,0.607175,0.005730,...,0.699024,0.927550,0.605199,0.959574,0.967454,0.813231,0.620038,0.150512,0.707604,0.909751
474,0.895410,0.745821,0.018456,0.950083,0.918643,0.987361,0.557040,0.401327,0.025770,0.003355,...,0.865890,0.987169,0.748287,0.990015,0.993794,0.967967,0.085507,0.142673,0.864616,0.928715


test_pred_main = [histories[i].test_predictions for i in range(len(histories))]
test_pred_main = [np.average(test_pred_main[i], axis=0) for i in range(len(test_pred_main))]
test_pred_main = np.mean(test_pred_main, axis=0)
df_test_pred_main = pd.DataFrame(data=test_pred_main, columns=output_categories_main)
df_test_pred_main

## Train the Imbalanced Column (question_type_spelling)

gkf_ibc = GroupKFold(n_splits=10).split(X=df_train_ibc.question_body, groups=df_train_ibc.question_body)

histories_ibc = []
for fold, (train_idx, valid_idx) in enumerate(gkf_ibc):
    
    # will actually only do 3 folds (out of 5) to manage < 2h
    if fold >= 3:
        K.clear_session()
        model = bert_model(1)

        train_inputs = [inputs_ibc[i][train_idx] for i in range(3)]
        train_outputs = outputs_ibc[train_idx]

        valid_inputs = [inputs_ibc[i][valid_idx] for i in range(3)]
        valid_outputs = outputs_ibc[valid_idx]

        # history contains two lists of valid and test preds respectively:
        #  [valid_predictions_{fold}, test_predictions_{fold}]
        history = train_and_predict(model, 
                          train_data=(train_inputs, train_outputs), 
                          valid_data=(valid_inputs, valid_outputs),
                          test_data=test_inputs_ibc, 
                          learning_rate=3e-5, epochs=5, batch_size=8,
                          loss_function='binary_crossentropy', fold=fold)

        histories_ibc.append(history)
        model.save_weights(f'bert-ibc-{fold}.h5')


test_pred_ibc = [histories_ibc[i].test_predictions for i in range(len(histories_ibc))]
test_pred_ibc = [np.average(test_pred_ibc[i], axis=0) for i in range(len(test_pred_ibc))]
test_pred_ibc = np.mean(test_pred_ibc, axis=0)
df_test_pred_ibc = df_test_ibc.reset_index().join(pd.DataFrame(test_pred_ibc))
df_test_pred_ibc = df_test_pred_ibc[["qa_id",0]]
df_test_pred_ibc.columns = ["qa_id","question_type_spelling"]

df_test_pred_ibc_rest = df_test[df_test.qa_id.isin(list(set(df_test.qa_id)-set(df_test_ibc.qa_id)))]
df_test_pred_ibc_rest['question_type_spelling'] = [0]*len(df_test_pred_ibc_rest)
df_test_pred_ibc_rest = df_test_pred_ibc_rest[['qa_id','question_type_spelling']]

df_test_pred_ibc = pd.concat([df_test_pred_ibc, df_test_pred_ibc_rest]).sort_values(by='qa_id')
df_test_pred_ibc = df_test_pred_ibc[["question_type_spelling"]]
df_test_pred_ibc

In [10]:
# set all culture and english/ell host to 1
df_test_pred_ibc = df_test[df_test.qa_id.isin(df_test_ibc.qa_id)]
df_test_pred_ibc['question_type_spelling'] = [1]*len(df_test_pred_ibc)
df_test_pred_ibc = df_test_pred_ibc[['qa_id','question_type_spelling']]

df_test_pred_ibc_rest = df_test[df_test.qa_id.isin(list(set(df_test.qa_id)-set(df_test_ibc.qa_id)))]
df_test_pred_ibc_rest['question_type_spelling'] = [0]*len(df_test_pred_ibc_rest)
df_test_pred_ibc_rest = df_test_pred_ibc_rest[['qa_id','question_type_spelling']]

df_test_pred_ibc = pd.concat([df_test_pred_ibc, df_test_pred_ibc_rest]).sort_values(by='qa_id')
df_test_pred_ibc = df_test_pred_ibc[["question_type_spelling"]]
df_test_pred_ibc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,question_type_spelling
0,0
1,0
2,0
3,0
4,0
...,...
471,0
472,0
473,0
474,0


#### 6. Process and submit test predictions

First the test predictions are read from the list of lists of `histories`. Then each test prediction list (in lists) is averaged. Then a mean of the averages is computed to get a single prediction for each data point. Finally, this is saved to `submission.csv`

In [11]:
for i in output_categories_main:
    df_sub[i] = df_test_pred_main[i]

df_sub = df_sub.reset_index(drop=True)
df_test_pred_ibc = df_test_pred_ibc.reset_index(drop=True)
for i in output_categories_ibc:
    df_sub[i] = df_test_pred_ibc[i]
    
df_sub
df_sub.to_csv('submission.csv', index=False)