In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import tensorflow_hub as hub
import tensorflow as tf
import bert.tokenization as tokenization
import tensorflow.keras.backend as K
import gc
import os
from scipy.stats import spearmanr
from math import floor, ceil
from tensorflow.keras.models import load_model

np.set_printoptions(suppress=True)

The following cell contains the source code of the `bert_tokenization` package.

In [7]:
import collections
import re
import unicodedata
import six
import tensorflow as tf


def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
    """Raises if `do_lower_case` contradicts the casing implied by the checkpoint.

    The casing is not stored alongside the checkpoint, so it is detected
    heuristically from the directory name that precedes `bert_model.ckpt`.

    Args:
      do_lower_case: The casing flag supplied by the user.
      init_checkpoint: Path of the checkpoint; falsy values skip the check.
    Raises:
      ValueError: If the directory name is a known uncased model while
        `do_lower_case` is falsy, or a known cased model while it is truthy.
    """
    if not init_checkpoint:
        return

    found = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
    if found is None:
        # The path does not follow the expected layout; nothing to validate.
        return
    model_name = found.group(1)

    uncased_names = (
        "uncased_L-24_H-1024_A-16",
        "uncased_L-12_H-768_A-12",
        "multilingual_L-12_H-768_A-12",
        "chinese_L-12_H-768_A-12",
    )
    cased_names = (
        "cased_L-12_H-768_A-12",
        "cased_L-24_H-1024_A-16",
        "multi_cased_L-12_H-768_A-12",
    )

    # (flag that was passed, kind of model detected, flag that should be passed)
    if model_name in uncased_names and not do_lower_case:
        mismatch = ("False", "lowercased", "True")
    elif model_name in cased_names and do_lower_case:
        mismatch = ("True", "cased", "False")
    else:
        return

    actual_flag, case_name, opposite_flag = mismatch
    raise ValueError(
        "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
        "However, `%s` seems to be a %s model, so you "
        "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
        "how the model was pre-training. If this error is wrong, please "
        "just comment out this check." % (actual_flag, init_checkpoint,
                                          model_name, case_name, opposite_flag))


def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input.

    Args:
      text: A `str`, or `bytes` holding utf-8 encoded text.
    Returns:
      `text` as a `str`. Byte sequences that are not valid utf-8 are dropped.
    Raises:
      ValueError: If `text` is neither `str` nor `bytes`.
    """
    # This notebook only runs on Python 3 (it uses f-strings), so the former
    # `six.PY2` branch -- which referenced the Python 2 only name `unicode` --
    # could never execute and has been removed.
    if isinstance(text, str):
        return text
    if isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    raise ValueError("Unsupported string type: %s" % (type(text)))


def printable_text(text):
    """Returns text encoded in a way suitable for print or `tf.logging`.

    Args:
      text: A `str`, or `bytes` holding utf-8 encoded text.
    Returns:
      `text` as a `str`. Byte sequences that are not valid utf-8 are dropped.
    Raises:
      ValueError: If `text` is neither `str` nor `bytes`.
    """
    # This notebook only runs on Python 3 (it uses f-strings), so the former
    # `six.PY2` branch -- which referenced the Python 2 only name `unicode` --
    # could never execute and has been removed.
    if isinstance(text, str):
        return text
    if isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    raise ValueError("Unsupported string type: %s" % (type(text)))


def load_vocab(vocab_file):
    """Reads a vocabulary file into an ordered token -> line-index mapping.

    Each line is stripped of surrounding whitespace and mapped to its
    zero-based line number; reading stops at the first empty read (EOF).
    """
    token_to_id = collections.OrderedDict()
    with tf.io.gfile.GFile(vocab_file, "r") as reader:
        next_id = 0
        line = convert_to_unicode(reader.readline())
        while line:
            token_to_id[line.strip()] = next_id
            # The counter advances per line, so a repeated token keeps the
            # index of its last occurrence.
            next_id += 1
            line = convert_to_unicode(reader.readline())
    return token_to_id


def convert_by_vocab(vocab, items):
    """Maps every entry of `items` through `vocab` and returns the results as a list.

    Works in either direction (token -> id or id -> token) depending on the
    mapping supplied. An entry missing from `vocab` raises `KeyError`.
    """
    return [vocab[entry] for entry in items]


def convert_tokens_to_ids(vocab, tokens):
    """Looks up the id of every token in `tokens` using the `vocab` mapping."""
    token_ids = convert_by_vocab(vocab, tokens)
    return token_ids


def convert_ids_to_tokens(inv_vocab, ids):
    """Looks up the token of every id in `ids` using the `inv_vocab` mapping."""
    tokens = convert_by_vocab(inv_vocab, ids)
    return tokens


def whitespace_tokenize(text):
    """Strips `text` and splits it on runs of whitespace.

    Returns an empty list when nothing but whitespace is left.
    """
    stripped = text.strip()
    return stripped.split() if stripped else []


class FullTokenizer(object):
    """Runs end-to-end tokenization: basic tokenization followed by WordPiece."""

    def __init__(self, vocab_file, do_lower_case=True):
        """Loads the vocabulary and builds the two tokenization stages.

        Args:
          vocab_file: Path of the vocabulary file passed to `load_vocab`.
          do_lower_case: Forwarded to `BasicTokenizer`.
        """
        self.vocab = load_vocab(vocab_file)
        # Reverse lookup (id -> token) derived from the loaded vocabulary.
        self.inv_vocab = dict((idx, tok) for tok, idx in self.vocab.items())
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        """Returns the word pieces of `text` as one flat list."""
        return [
            piece
            for word in self.basic_tokenizer.tokenize(text)
            for piece in self.wordpiece_tokenizer.tokenize(word)
        ]

    def convert_tokens_to_ids(self, tokens):
        """Maps tokens to ids through the loaded vocabulary."""
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        """Maps ids back to tokens through the inverse vocabulary."""
        return convert_by_vocab(self.inv_vocab, ids)


class BasicTokenizer(object):
    """Basic tokenizer: text cleanup, optional lower casing, punctuation splitting."""

    def __init__(self, do_lower_case=True):
        """Constructs a BasicTokenizer.
        Args:
          do_lower_case: Whether to lower case the input.
        """
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        """Splits `text` into a list of basic tokens."""
        cleaned = self._clean_text(convert_to_unicode(text))

        # CJK characters are surrounded with spaces so that each one becomes a
        # token of its own. This is applied to every model, not only the
        # multilingual and Chinese ones.
        cleaned = self._tokenize_chinese_chars(cleaned)

        pieces = []
        for word in whitespace_tokenize(cleaned):
            if self.do_lower_case:
                word = self._run_strip_accents(word.lower())
            pieces.extend(self._run_split_on_punc(word))

        return whitespace_tokenize(" ".join(pieces))

    def _run_strip_accents(self, text):
        """Removes combining marks (category "Mn") after NFD normalization."""
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(
            ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    def _run_split_on_punc(self, text):
        """Splits `text` so that each punctuation character is its own piece."""
        groups = []
        begin_new_group = True
        for ch in text:
            if _is_punctuation(ch):
                # Punctuation stands alone and forces the next character to
                # open a new group.
                groups.append([ch])
                begin_new_group = True
                continue
            if begin_new_group:
                groups.append([])
                begin_new_group = False
            groups[-1].append(ch)

        return ["".join(group) for group in groups]

    def _tokenize_chinese_chars(self, text):
        """Surrounds every CJK character in `text` with single spaces."""
        return "".join(
            (" " + ch + " ") if self._is_chinese_char(ord(ch)) else ch
            for ch in text)

    def _is_chinese_char(self, cp):
        """Tells whether code point `cp` lies in one of the CJK ideograph ranges."""
        # Only the CJK ideograph ranges are covered. Hangul, Hiragana and
        # Katakana live in other blocks and are written with spaces between
        # words, so they get no special treatment.
        cjk_ranges = (
            (0x4E00, 0x9FFF),
            (0x3400, 0x4DBF),
            (0x20000, 0x2A6DF),
            (0x2A700, 0x2B73F),
            (0x2B740, 0x2B81F),
            (0x2B820, 0x2CEAF),
            (0xF900, 0xFAFF),
            (0x2F800, 0x2FA1F),
        )
        return any(low <= cp <= high for low, high in cjk_ranges)

    def _clean_text(self, text):
        """Drops NUL, U+FFFD and control characters; maps whitespace to " "."""
        kept = []
        for ch in text:
            code = ord(ch)
            if code in (0, 0xfffd) or _is_control(ch):
                continue
            kept.append(" " if _is_whitespace(ch) else ch)
        return "".join(kept)


class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
        """Stores the vocabulary, the unknown-token marker and the word length cap."""
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Splits `text` into word pieces with greedy longest-match-first lookup.

        For example:
          input = "unaffable"
          output = ["un", "##aff", "##able"]
        Args:
          text: A single token or whitespace separated tokens, expected to come
            from `BasicTokenizer`.
        Returns:
          A list of wordpiece tokens. A word that is longer than
          `max_input_chars_per_word`, or that cannot be fully covered by
          vocabulary entries, yields the unknown token instead.
        """
        text = convert_to_unicode(text)

        result = []
        for word in whitespace_tokenize(text):
            word_len = len(word)
            if word_len > self.max_input_chars_per_word:
                result.append(self.unk_token)
                continue

            pieces = []
            pos = 0
            while pos < word_len:
                matched = None
                # Try the longest remaining substring first, shrinking from
                # the right until a vocabulary entry is found.
                for stop in range(word_len, pos, -1):
                    candidate = word[pos:stop]
                    if pos > 0:
                        # Non-initial pieces carry the continuation prefix.
                        candidate = "##" + candidate
                    if candidate in self.vocab:
                        matched = candidate
                        break
                if matched is None:
                    # No piece starts at `pos`: the whole word is unknown.
                    pieces = None
                    break
                pieces.append(matched)
                pos = stop

            if pieces is None:
                result.append(self.unk_token)
            else:
                result.extend(pieces)
        return result


def _is_whitespace(char):
    """Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically contorl characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `chars` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat in ("Cc", "Cf"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `chars` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False

# Bert-based Model

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import tensorflow_hub as hub
import tensorflow as tf
#import bert_tokenization as tokenization
import tensorflow.keras.backend as K
import gc
import os
from scipy.stats import spearmanr
from math import floor, ceil
from tensorflow.keras.models import load_model

np.set_printoptions(suppress=True)

In [12]:
# Load the data files and build the tokenizer
PATH = 'data/'
BERT_PATH = 'bert/'
MAX_SEQUENCE_LENGTH = 512
tokenizer = FullTokenizer(vocab_file=BERT_PATH + 'vocab.txt', do_lower_case=True)

# Read the three CSV files in the order train, test, sample submission.
df_train, df_test, df_sub = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)
print('train shape =', df_train.shape)
print('test shape =', df_test.shape)

# Columns are picked by position: everything from index 11 onwards is a
# target, and positions 1, 2 and 5 are the text inputs.
output_categories = df_train.columns[11:].tolist()
input_categories = [df_train.columns[pos] for pos in (1, 2, 5)]
print('\noutput categories:\n\t', output_categories)
print('\ninput categories:\n\t', input_categories)

train shape = (6079, 41)
test shape = (476, 11)

output categories:
	 ['question_asker_intent_understanding', 'question_body_critical', 'question_conversational', 'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer', 'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent', 'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice', 'question_type_compare', 'question_type_consequence', 'question_type_definition', 'question_type_entity', 'question_type_instructions', 'question_type_procedure', 'question_type_reason_explanation', 'question_type_spelling', 'question_well_written', 'answer_helpful', 'answer_level_of_information', 'answer_plausible', 'answer_relevance', 'answer_satisfaction', 'answer_type_instructions', 'answer_type_procedure', 'answer_type_reason_explanation', 'answer_well_written']

input categories:
	 ['question_title', 'question_body', 'answer']


In [13]:
def _get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def _get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            if first_sep:
                first_sep = False 
            else:
                current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def _get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

def _trim_input(title, question, answer, max_sequence_length, 
                t_max_len=30, q_max_len=239, a_max_len=239):

    t = tokenizer.tokenize(title)
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)
    
    t_len = len(t)
    q_len = len(q)
    a_len = len(a)

    if (t_len+q_len+a_len+4) > max_sequence_length:
        
        if t_max_len > t_len:
            t_new_len = t_len
            a_max_len = a_max_len + floor((t_max_len - t_len)/2)
            q_max_len = q_max_len + ceil((t_max_len - t_len)/2)
        else:
            t_new_len = t_max_len
      
        if a_max_len > a_len:
            a_new_len = a_len 
            q_new_len = q_max_len + (a_max_len - a_len)
        elif q_max_len > q_len:
            a_new_len = a_max_len + (q_max_len - q_len)
            q_new_len = q_len
        else:
            a_new_len = a_max_len
            q_new_len = q_max_len
            
            
        if t_new_len+a_new_len+q_new_len+4 != max_sequence_length:
            raise ValueError("New sequence length should be %d, but is %d" 
                             % (max_sequence_length, (t_new_len+a_new_len+q_new_len+4)))
        
        t = t[:t_new_len]
        q = q[:q_new_len]
        a = a[:a_new_len]
    
    return t, q, a

def _convert_to_bert_inputs(title, question, answer, tokenizer, max_sequence_length):
    """Converts tokenized input to ids, masks and segments for BERT"""
    
    stoken = ["[CLS]"] + title + ["[SEP]"] + question + ["[SEP]"] + answer + ["[SEP]"]

    input_ids = _get_ids(stoken, tokenizer, max_sequence_length)
    input_masks = _get_masks(stoken, max_sequence_length)
    input_segments = _get_segments(stoken, max_sequence_length)

    return [input_ids, input_masks, input_segments]

def compute_input_arays(df, columns, tokenizer, max_sequence_length):
    input_ids, input_masks, input_segments = [], [], []
    for _, instance in tqdm(df[columns].iterrows()):
        t, q, a = instance.question_title, instance.question_body, instance.answer

        t, q, a = _trim_input(t, q, a, max_sequence_length)

        ids, masks, segments = _convert_to_bert_inputs(t, q, a, tokenizer, max_sequence_length)
        input_ids.append(ids)
        input_masks.append(masks)
        input_segments.append(segments)
        
    return [np.asarray(input_ids, dtype=np.int32), 
            np.asarray(input_masks, dtype=np.int32), 
            np.asarray(input_segments, dtype=np.int32)]


def compute_output_arrays(df, columns):
    """Returns the `columns` of `df` as a NumPy array (rows x columns)."""
    selected = df[columns]
    return np.asarray(selected)

In [14]:
def compute_spearmanr(trues, preds):
    """Returns the mean column-wise Spearman correlation of `preds` vs `trues`.

    Gaussian noise with standard deviation 1e-7 is added to each prediction
    column before the correlation is computed.
    """
    column_rhos = [
        spearmanr(
            true_col,
            pred_col + np.random.normal(0, 1e-7, pred_col.shape[0]),
        ).correlation
        for true_col, pred_col in zip(trues.T, preds.T)
    ]
    return np.mean(column_rhos)


class CustomCallback(tf.keras.callbacks.Callback):
    """Collects validation and test predictions after every training epoch.

    After each epoch the model predicts on the validation inputs; the mean of
    all validation predictions gathered so far is scored with
    `compute_spearmanr` and printed. Test predictions are collected as well,
    and weights are saved when a fold is given.

    Attributes set by `on_train_begin`:
      valid_predictions: One validation prediction array per finished epoch.
      test_predictions: One test prediction array per finished epoch.
    """
    
    def __init__(self, valid_data, test_data, batch_size=16, fold=None):
        """Stores the data used for the per-epoch predictions.

        Args:
          valid_data: Pair `(inputs, outputs)` for validation.
          test_data: Inputs for the test predictions.
          batch_size: Batch size passed to `model.predict`.
          fold: Fold identifier used in the saved weight file names; weights
            are not saved when it is None.
        """
        # Let the Keras base class initialise its own state.
        super().__init__()

        self.valid_inputs = valid_data[0]
        self.valid_outputs = valid_data[1]
        self.test_inputs = test_data
        
        self.batch_size = batch_size
        self.fold = fold
        
    def on_train_begin(self, logs=None):
        """Resets the prediction lists at the start of training."""
        self.valid_predictions = []
        self.test_predictions = []
        
    def on_epoch_end(self, epoch, logs=None):
        """Predicts on validation and test data and reports the validation rho."""
        self.valid_predictions.append(
            self.model.predict(self.valid_inputs, batch_size=self.batch_size))
        
        # Score the average over all epochs so far, not only the latest one.
        rho_val = compute_spearmanr(
            self.valid_outputs, np.average(self.valid_predictions, axis=0))
        
        print("\nvalidation rho: %.4f" % rho_val)
        
        if self.fold is not None:
            # Use the fold stored on the instance; the bare name `fold` is not
            # defined in this scope and raised NameError.
            self.model.save_weights(f'bert-base-{self.fold}-{epoch}.h5py')
        
        self.test_predictions.append(
            self.model.predict(self.test_inputs, batch_size=self.batch_size)
        )

def bert_model():
    """Builds the BERT-based Keras model with 30 sigmoid outputs.

    The model takes three int32 inputs of length MAX_SEQUENCE_LENGTH (word
    ids, masks, segments), passes them through the hub layer loaded from
    BERT_PATH, averages the sequence output over the sequence axis, applies
    dropout, and ends in a 30-unit sigmoid dense layer.

    Returns:
      An uncompiled `tf.keras.models.Model`.
    """
    
    input_word_ids = tf.keras.layers.Input(
        (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_word_ids')
    input_masks = tf.keras.layers.Input(
        (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_masks')
    input_segments = tf.keras.layers.Input(
        (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_segments')
    
    # trainable=True: the BERT weights are updated during training.
    bert_layer = hub.KerasLayer(BERT_PATH, trainable=True)
    
    # The first output of the layer is discarded (presumably the pooled
    # output -- confirm against the hub module); only the second one is used.
    _, sequence_output = bert_layer([input_word_ids, input_masks, input_segments])
    
    x = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
    x = tf.keras.layers.Dropout(0.2)(x)
    out = tf.keras.layers.Dense(30, activation="sigmoid", name="dense_output")(x)

    model = tf.keras.models.Model(
        inputs=[input_word_ids, input_masks, input_segments], outputs=out)
    
    return model    
        
def train_and_predict(model, train_data, valid_data, test_data, 
                      learning_rate, epochs, batch_size, loss_function, fold):
    """Compiles and fits `model`, collecting per-epoch predictions.

    Args:
      model: Keras model to train.
      train_data: Indexable whose items 0 and 1 are training inputs and outputs.
      valid_data: Indexable whose items 0 and 1 are validation inputs and outputs.
      test_data: Inputs for the per-epoch test predictions.
      learning_rate: Learning rate of the Adam optimizer.
      epochs: Number of training epochs.
      batch_size: Batch size for training and for the callback predictions.
      loss_function: Loss passed to `model.compile`.
      fold: Accepted but not forwarded; the callback always receives
        `fold=None`, so it saves no weights.
    Returns:
      The `CustomCallback` holding the collected predictions.
    """
    model.compile(
        loss=loss_function,
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate))

    prediction_tracker = CustomCallback(
        valid_data=(valid_data[0], valid_data[1]),
        test_data=test_data,
        batch_size=batch_size,
        fold=None)

    train_inputs, train_outputs = train_data[0], train_data[1]
    model.fit(
        train_inputs,
        train_outputs,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[prediction_tracker])

    return prediction_tracker

In [15]:
# Grouped 10-fold split keyed on question_body (originally n_splits=5).
# NOTE(review): `gkf` is not consumed by any of the visible cells -- confirm
# whether it is still needed.
gkf = GroupKFold(n_splits=10).split(X=df_train.question_body, groups=df_train.question_body)

# The training inputs/outputs are left disabled; only the test inputs are built.
#outputs = compute_output_arrays(df_train, output_categories)
#inputs = compute_input_arays(df_train, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
test_inputs = compute_input_arays(df_test, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




## Check the performance of the model on each column

In [82]:
# Split dataset for validation from the train set
# NOTE(review): these rows are drawn from df_train itself. If the loaded
# models were trained on them, the scores below are optimistic -- confirm.
VALID_SAMPLE_SIZE = 476  # same number of rows as the test set
VALID_SEED = 42          # fixed seed so the same rows are drawn on every run
df_valid = df_train.sample(n=VALID_SAMPLE_SIZE, random_state=VALID_SEED)
df_valid_y = df_valid[df_valid.columns[11:]]
df_valid_x = df_valid[df_valid.columns[:11]]

# Convert dataset to input
valid_inputs = compute_input_arays(df_valid_x, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [123]:
df_valid_x.head()

Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,host
2780,4429,Dimension of a quotient ring,"\n What is the Krull dimension of $B=A[x,y,z]...",Zoey,https://math.stackexchange.com/users/234074,"Your first ring $B$ is just $A[x^{\pm1}]$, the...",Mariano Suárez-Álvarez,https://math.stackexchange.com/users/274,http://math.stackexchange.com/questions/138610...,SCIENCE,math.stackexchange.com
6013,9539,Is there a way to read the columns in a shapef...,"My shapefile has a number of fields, like road...",patrick,https://gis.stackexchange.com/users/733,"Yes, a shapefile in this context is a feature ...",Petr Krebs,https://gis.stackexchange.com/users/434,http://gis.stackexchange.com/questions/4619/is...,TECHNOLOGY,gis.stackexchange.com
2992,4770,Quantity based discount for single product in ...,"I'm using latest build of Expresso Store, 2.3....",neekster,https://expressionengine.stackexchange.com/use...,"Use this bulk discounts add-on, does exactly w...",Peter Lewis,https://expressionengine.stackexchange.com/use...,http://expressionengine.stackexchange.com/ques...,TECHNOLOGY,expressionengine.stackexchange.com
150,238,hide javascript/jquery scripts from html page?,How do I hide my javascript/jquery scripts fro...,Shaitender Singh,https://stackoverflow.com/users/192252,"You can't, sorry. No matter what you do, even ...",Brian Campbell,https://stackoverflow.com/users/69755,http://stackoverflow.com/questions/1628799/hid...,STACKOVERFLOW,stackoverflow.com
1827,2898,Which whois is the most frequently updated?,"As the domain manager of 200+ domains, it's es...",Miss M,https://webmasters.stackexchange.com/users/52761,"Being one who has a similar site, I would have...",closetnoc,https://webmasters.stackexchange.com/users/36029,http://webmasters.stackexchange.com/questions/...,TECHNOLOGY,webmasters.stackexchange.com


In [124]:
df_valid_y.head()

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
2780,0.777778,0.444444,0.0,1.0,1.0,0.666667,0.555556,0.333333,0.0,0.0,...,0.666667,1.0,0.777778,1.0,1.0,0.933333,0.333333,0.0,0.333333,1.0
6013,1.0,1.0,0.0,0.666667,0.333333,1.0,0.444444,0.333333,0.0,0.0,...,0.888889,1.0,0.666667,1.0,1.0,0.933333,0.666667,0.0,0.333333,0.888889
2992,0.888889,0.444444,0.0,1.0,1.0,1.0,0.777778,0.444444,0.666667,0.0,...,0.444444,1.0,0.666667,1.0,1.0,1.0,1.0,0.0,0.0,0.888889
150,1.0,1.0,0.0,0.666667,1.0,0.666667,0.555556,0.444444,0.0,0.0,...,0.888889,1.0,0.777778,1.0,1.0,0.866667,1.0,0.333333,0.666667,1.0
1827,0.777778,0.777778,0.666667,1.0,1.0,1.0,0.666667,0.333333,0.0,0.0,...,1.0,0.888889,0.777778,0.888889,1.0,0.8,0.0,0.0,0.333333,1.0


In [86]:
# Load trained models
model_path = [
    'bert/Full-0.h5',
    'bert/bertuned_f0.h5',
    'bert/bertuned_f1.h5',
    'bert/bertuned_f2.h5',
    'bert/bertuned_f3.h5',
    'bert/bertuned_f4.h5',
]
models = []

# Build one fresh network per weight file and load the weights into it.
for mp in model_path:
    model = bert_model()
    model.load_weights(mp)
    models.append(model)

In [37]:
len(models)

6

In [88]:
# Predict the labels of validation set
valid_predictions = []

# Only the first loaded model is evaluated here.
for model in models[:1]:
    valid_predictions += [model.predict(valid_inputs, batch_size=8, verbose=1)]

# Average over the collected models (axis 0 is the model axis).
final_predictions_valid = np.stack(valid_predictions).mean(axis=0)



In [92]:
df_valid_pred = pd.DataFrame(final_predictions_valid)
df_valid_pred

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.869381,0.491549,0.007663,0.987843,0.982960,0.594069,0.570143,0.319139,0.018902,0.001266,...,0.829597,0.991001,0.713615,0.990622,0.997140,0.957748,0.215083,0.009544,0.214697,0.967283
1,0.987236,0.913054,0.002851,0.818194,0.262798,0.989817,0.405797,0.284093,0.006089,0.002051,...,0.939046,0.993420,0.702525,0.997489,0.997331,0.961701,0.451416,0.018958,0.317158,0.955285
2,0.946362,0.559000,0.005701,0.153690,0.886456,0.964760,0.368762,0.363530,0.134047,0.005127,...,0.689516,0.986197,0.603693,0.992347,0.991689,0.804043,0.963588,0.076932,0.037348,0.901946
3,0.977030,0.689247,0.004422,0.971632,0.898258,0.885676,0.627077,0.393279,0.008055,0.000502,...,0.783575,0.992845,0.656368,0.996941,0.996181,0.927622,0.816781,0.196169,0.906635,0.977425
4,0.981712,0.708787,0.003829,0.979621,0.950203,0.153332,0.711191,0.507949,0.174303,0.000851,...,0.960060,0.970717,0.751022,0.978112,0.988915,0.865172,0.116016,0.242364,0.919507,0.956978
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.857350,0.769211,0.012732,0.596025,0.964083,0.889377,0.578748,0.362019,0.384986,0.002616,...,0.914520,0.958980,0.584990,0.987119,0.988563,0.855754,0.287681,0.459318,0.971183,0.947662
472,0.959904,0.536496,0.004854,0.860072,0.977445,0.960938,0.607823,0.564451,0.038689,0.001744,...,0.877827,0.943477,0.634282,0.980762,0.994144,0.878152,0.924284,0.350154,0.083588,0.895772
473,0.953137,0.914440,0.005238,0.964677,0.976245,0.984673,0.712662,0.537255,0.004460,0.002185,...,0.920890,0.983903,0.749031,0.996012,0.997102,0.941693,0.579897,0.399180,0.419634,0.970456
474,0.841125,0.376478,0.007780,0.990137,0.961846,0.880656,0.562385,0.458296,0.439309,0.006095,...,0.801846,0.976788,0.760943,0.973384,0.990753,0.902006,0.991708,0.008178,0.040411,0.943300


In [93]:
df_valid_y

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
2780,0.777778,0.444444,0.000000,1.000000,1.000000,0.666667,0.555556,0.333333,0.000000,0.0,...,0.666667,1.000000,0.777778,1.000000,1.000000,0.933333,0.333333,0.000000,0.333333,1.000000
6013,1.000000,1.000000,0.000000,0.666667,0.333333,1.000000,0.444444,0.333333,0.000000,0.0,...,0.888889,1.000000,0.666667,1.000000,1.000000,0.933333,0.666667,0.000000,0.333333,0.888889
2992,0.888889,0.444444,0.000000,1.000000,1.000000,1.000000,0.777778,0.444444,0.666667,0.0,...,0.444444,1.000000,0.666667,1.000000,1.000000,1.000000,1.000000,0.000000,0.000000,0.888889
150,1.000000,1.000000,0.000000,0.666667,1.000000,0.666667,0.555556,0.444444,0.000000,0.0,...,0.888889,1.000000,0.777778,1.000000,1.000000,0.866667,1.000000,0.333333,0.666667,1.000000
1827,0.777778,0.777778,0.666667,1.000000,1.000000,1.000000,0.666667,0.333333,0.000000,0.0,...,1.000000,0.888889,0.777778,0.888889,1.000000,0.800000,0.000000,0.000000,0.333333,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3563,0.666667,0.777778,0.000000,0.333333,1.000000,0.666667,0.666667,0.444444,0.333333,0.0,...,0.777778,1.000000,0.666667,1.000000,1.000000,0.800000,0.333333,0.333333,1.000000,0.888889
1999,0.777778,0.555556,0.000000,0.333333,1.000000,0.666667,0.666667,0.444444,0.333333,0.0,...,0.666667,0.777778,0.500000,0.888889,1.000000,0.666667,0.333333,0.333333,0.666667,0.777778
213,0.888889,0.888889,0.000000,1.000000,1.000000,1.000000,0.666667,0.555556,0.000000,0.0,...,0.888889,1.000000,0.666667,1.000000,1.000000,0.933333,0.666667,0.333333,0.333333,1.000000
4232,0.777778,0.333333,0.000000,1.000000,1.000000,0.666667,0.333333,0.555556,0.333333,0.0,...,0.777778,1.000000,0.777778,1.000000,0.833333,0.800000,1.000000,0.000000,0.000000,0.888889


In [122]:
df_test.head()

Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,host
0,39,Will leaving corpses lying around upset my pri...,I see questions/information online about how t...,Dylan,https://gaming.stackexchange.com/users/64471,There is no consequence for leaving corpses an...,Nelson868,https://gaming.stackexchange.com/users/97324,http://gaming.stackexchange.com/questions/1979...,CULTURE,gaming.stackexchange.com
1,46,Url link to feature image in the portfolio,I am new to Wordpress. i have issue with Featu...,Anu,https://wordpress.stackexchange.com/users/72927,I think it is possible with custom fields.\n\n...,Irina,https://wordpress.stackexchange.com/users/27233,http://wordpress.stackexchange.com/questions/1...,TECHNOLOGY,wordpress.stackexchange.com
2,70,"Is accuracy, recoil or bullet spread affected ...","To experiment I started a bot game, toggled in...",Konsta,https://gaming.stackexchange.com/users/37545,You do not have armour in the screenshots. Thi...,Damon Smithies,https://gaming.stackexchange.com/users/70641,http://gaming.stackexchange.com/questions/2154...,CULTURE,gaming.stackexchange.com
3,132,Suddenly got an I/O error from my external HDD,I have used my Raspberry Pi as a torrent-serve...,robbannn,https://raspberrypi.stackexchange.com/users/17341,Your Western Digital hard drive is disappearin...,HeatfanJohn,https://raspberrypi.stackexchange.com/users/1311,http://raspberrypi.stackexchange.com/questions...,TECHNOLOGY,raspberrypi.stackexchange.com
4,200,Passenger Name - Flight Booking Passenger only...,I have bought Delhi-London return flights for ...,Amit,https://travel.stackexchange.com/users/29089,I called two persons who work for Saudia (tick...,Nean Der Thal,https://travel.stackexchange.com/users/10051,http://travel.stackexchange.com/questions/4704...,CULTURE,travel.stackexchange.com


### Check the scores using spearmanr from scipy

In [120]:
# performance of each column in the validation dataset, model 1
# (`spearmanr` is already imported in the import cells at the top.)

# Predictions and targets are compared column by column, by position, so
# both frames must have the same number of columns.
n_labels = df_valid_y.shape[1]
assert df_valid_pred.shape[1] == n_labels, "prediction/target column count mismatch"

df_spearman = pd.DataFrame(df_valid_y.columns,columns=["Label"])
# A constant column has no defined rank correlation and yields NaN.
coefs = [
    spearmanr(df_valid_pred.iloc[:, i], df_valid_y.iloc[:, i]).correlation
    for i in range(n_labels)
]

df_spearman["Score"] = coefs
df_spearman

Unnamed: 0,Label,Score
0,question_asker_intent_understanding,0.737226
1,question_body_critical,0.881443
2,question_conversational,0.512731
3,question_expect_short_answer,0.813718
4,question_fact_seeking,0.833274
5,question_has_commonly_accepted_answer,0.776549
6,question_interestingness_others,0.714254
7,question_interestingness_self,0.809268
8,question_multi_intent,0.823193
9,question_not_really_a_question,0.157999


The scores of question_not_really_a_question (label 10, 0.157999), question_type_consequence (label 14, 0.187766) and question_type_spelling (label 20, NaN) are significantly lower than the others. (Labels are numbered from 1 here, whereas the table above is indexed from 0.)

After checking the training set, we found that the data in these columns is imbalanced, so we are considering addressing this problem with imblearn or other methods.

## Run the model on the test set

In [38]:
test_predictions = []

In [40]:
# Predict the test set with every loaded model.
# NOTE: this appends to the `test_predictions` list created in the previous
# cell, so re-running this cell alone adds a second copy of every prediction.
for model in models:
    test_predictions.append(model.predict(test_inputs, batch_size=8, verbose=1)) 

In [41]:
final_predictions = np.mean(test_predictions, axis=0)

In [42]:
# Fill every column after the first with the averaged predictions, then write
# the submission file without the DataFrame index.
df_sub.iloc[:, 1:] = final_predictions
df_sub.to_csv('submission.csv', index=False)

In [43]:
df_sub.head()

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.938958,0.627481,0.1668,0.514917,0.576923,0.622745,0.676879,0.648084,0.729218,...,0.887719,0.919156,0.604636,0.973368,0.963013,0.798283,0.028482,0.020496,0.89888,0.909044
1,46,0.881514,0.450305,0.006302,0.778764,0.729956,0.920597,0.584292,0.499064,0.071643,...,0.669986,0.962445,0.635315,0.980587,0.979045,0.878402,0.94789,0.075705,0.032106,0.904202
2,70,0.884626,0.625738,0.013018,0.840316,0.880019,0.968759,0.609891,0.463821,0.160505,...,0.830779,0.925678,0.587583,0.966551,0.969164,0.795232,0.072927,0.044401,0.845021,0.905604
3,132,0.890446,0.398783,0.005364,0.725932,0.784621,0.939685,0.550969,0.413887,0.067337,...,0.683051,0.943653,0.678593,0.972033,0.98126,0.889558,0.818465,0.12361,0.496863,0.899144
4,200,0.916232,0.40829,0.02144,0.886373,0.700372,0.888922,0.65518,0.603831,0.033444,...,0.631891,0.926608,0.628834,0.97534,0.963328,0.849761,0.226559,0.118528,0.615895,0.899786
