In [1]:
import tokenizers, math, pickle
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as kfold
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold as stratkfold
from transformers import *

In [2]:
# Constants
# Reproducibility / cross-validation settings.
SEED = 88888
FOLDS = 5

# Training settings.
EPOCHS = 3
BATCH_SIZE = 32

# Fixed column count of every id / mask / label array built below.
LEN = 96

# Token id written right after the leading 0 token to encode the tweet's sentiment.
MAP_SENTI_TO_ID = dict(positive=1313, negative=2430, neutral=7974)

# Competition data.
path_train = '../input/tweet-sentiment-extraction/train.csv'
path_test = '../input/tweet-sentiment-extraction/test.csv'

# RoBERTa tokenizer files, model config and pretrained weights.
path_vocab = '../input/tf-roberta/vocab-roberta-base.json'
path_merge = '../input/tf-roberta/merges-roberta-base.txt'
path_config = '../input/tf-roberta/config-roberta-base.json'
path_pretrained = '../input/tf-roberta/pretrained-roberta-base.h5'

In [3]:
# Utils
def jaccard_idx(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A ∩ B| / |A ∪ B|`` over the resulting word sets.

    Returns 0.5 when both strings contain no words, otherwise a float in
    ``[0.0, 1.0]``.
    """
    words_a = set(str1.lower().split())
    words_b = set(str2.lower().split())

    # Neither side has any word: the ratio is undefined, use the fixed score.
    if not words_a and not words_b:
        return 0.5

    shared = words_a & words_b
    union_size = len(words_a) + len(words_b) - len(shared)
    return float(len(shared)) / union_size

def dump_data(mod, path):
    """Pickle the weights returned by ``mod.get_weights()`` to ``path``.

    The weights are fetched before the file is opened, so a failing
    ``get_weights()`` call does not create or truncate the file.
    """
    weights = mod.get_weights()
    with open(path, 'wb') as sink:
        pickle.dump(weights, sink)

def load_data(mod, path):
    """Restore weights pickled at ``path`` into ``mod`` and return ``mod``.

    The file is unpickled and the result is handed to ``mod.set_weights``.
    Unpickling can execute arbitrary code, so only point this at files you
    wrote yourself (e.g. with ``dump_data``).
    """
    with open(path, 'rb') as source:
        restored = pickle.load(source)
    mod.set_weights(restored)
    return mod

def cal_loss(yt, yhat):
    """Mean label-smoothed categorical cross-entropy over the batch.

    ``yt`` is cut down to the first ``yhat.shape[1]`` columns before the loss
    is computed, so targets wider than the predictions are accepted.
    ``yhat`` is treated as probabilities (``from_logits=False``) and the
    targets are smoothed with ``label_smoothing=0.1``.
    """
    pred_width = tf.shape(yhat)[1]
    trimmed_targets = yt[:, :pred_width]
    per_row = tf.keras.losses.categorical_crossentropy(
        trimmed_targets, yhat, from_logits=False, label_smoothing=0.1
    )
    return tf.reduce_mean(per_row)

In [4]:
# Initialize
# Raw data; missing cells become empty strings so the string methods used later work.
train = pd.read_csv(path_train).fillna('')
test = pd.read_csv(path_test).fillna('')

config = RobertaConfig.from_pretrained(path_config)
tknzer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=path_vocab,
    merges_file=path_merge,
    lowercase=True,
    add_prefix_space=True,
)
strat = stratkfold(n_splits=FOLDS, shuffle=True, random_state=SEED)

tf.random.set_seed(SEED)
np.random.seed(SEED)

# Token-id matrices start out filled with 1, the id build_model() counts as padding.
in_ids = np.full((train.shape[0], LEN), 1, dtype='int32')
in_ids_t = np.full((test.shape[0], LEN), 1, dtype='int32')

# Attention masks, token-type ids and one-hot start/end labels start as zeros
# with the same shape and dtype as the matching id matrix.
mask, types, start, end = (np.zeros_like(in_ids) for _ in range(4))
mask_t, types_t = (np.zeros_like(in_ids_t) for _ in range(2))

In [5]:
def convertTrainData():
    """Tokenize every training row and fill the module-level training arrays.

    For each row of ``train`` the whitespace-normalized ``text`` (with one
    leading space) is encoded with ``tknzer`` and written into ``in_ids`` as
    ``[0, sentiment_id] + token_ids + [2]``; the same columns of ``mask`` are
    set to 1.  Tokens overlapping the normalized ``selected_text`` get a
    one-hot label in ``start`` (first such token) and ``end`` (last such
    token), shifted by 2 for the two prefix tokens.

    Rows whose selected text is empty or does not occur in the text are left
    without a start/end label instead of being labeled through negative
    index wrap-around.

    Reads globals: ``train``, ``tknzer``, ``MAP_SENTI_TO_ID``.
    Mutates globals in place: ``in_ids``, ``mask``, ``start``, ``end``.
    Rows are addressed with ``train.loc[0 .. n-1]`` — assumes the default
    integer index; TODO confirm if ``train`` is ever re-indexed.
    """
    for cur in range(train.shape[0]):
        str1 = " " + " ".join(train.loc[cur, 'text'].split())
        str2 = " ".join(train.loc[cur, 'selected_text'].split())

        # Character-level mask of the selected span inside str1.
        vec = np.zeros(len(str1))
        pos = str1.find(str2) if str2 else -1
        if pos >= 0:
            vec[pos:pos + len(str2)] = 1
            # Also claim the space right before the span, so the token that
            # carries that leading space counts as overlapping.
            if pos > 0 and str1[pos - 1] == ' ':
                vec[pos - 1] = 1

        enc = tknzer.encode(str1)

        # Character range of each token, rebuilt from the decoded token lengths.
        gather = []
        offset = 0
        for tok_id in enc.ids:
            width = len(tknzer.decode([tok_id]))
            gather.append((offset, offset + width))
            offset += width

        # Indices of the tokens that touch at least one selected character.
        tokens = [i for i, (a, b) in enumerate(gather) if np.sum(vec[a:b]) > 0]

        stok = MAP_SENTI_TO_ID[train.loc[cur, 'sentiment']]
        # 0 and 2 frame the sequence (presumably RoBERTa's <s>/</s> ids — confirm against the vocab).
        in_ids[cur, :len(enc.ids) + 3] = [0, stok] + enc.ids + [2]
        mask[cur, :len(enc.ids) + 3] = 1
        if len(tokens) > 0:
            start[cur, tokens[0] + 2] = 1
            end[cur, tokens[-1] + 2] = 1

def convertTestData():
    """Tokenize every test row and fill the module-level test arrays.

    Each row of ``test`` has its ``text`` whitespace-normalized (with one
    leading space), encoded with ``tknzer`` and stored in ``in_ids_t`` as
    ``[0, sentiment_id] + token_ids + [2]``; the same columns of ``mask_t``
    are set to 1.

    Reads globals: ``test``, ``tknzer``, ``MAP_SENTI_TO_ID``.
    Mutates globals in place: ``in_ids_t``, ``mask_t``.
    """
    n_rows = test.shape[0]
    for row in range(n_rows):
        normalized = " " + " ".join(test.loc[row, 'text'].split())
        token_ids = tknzer.encode(normalized).ids
        sentiment_id = MAP_SENTI_TO_ID[test.loc[row, 'sentiment']]

        # Two prefix tokens plus one closing token surround the text tokens.
        used = len(token_ids) + 3
        in_ids_t[row, :used] = [0, sentiment_id] + token_ids + [2]
        mask_t[row, :used] = 1

def build_model():
    """Build the RoBERTa span-extraction model and its fixed-width twin.

    Both models take three ``(LEN,)`` int32 inputs: token ids, attention
    mask, token-type ids.  Inside the graph the batch is trimmed to the
    longest row that is not all padding (id 1 counts as padding) before it is
    fed to the pretrained RoBERTa encoder.  Two identical heads
    (Dropout -> Conv1D -> LeakyReLU -> Dense(1) -> Flatten -> softmax) turn
    the encoder output into start and end distributions.

    Returns
    -------
    (new_model, new_pad_model)
        ``new_model`` is compiled with ``cal_loss`` and Adam(3e-5) and emits
        the two distributions at the trimmed width.  ``new_pad_model`` shares
        the same layers but right-pads both outputs with zeros back to
        ``LEN`` columns; it is not compiled.
    """
    ids_in = tf.keras.layers.Input((LEN,), dtype=tf.int32)
    mask_in = tf.keras.layers.Input((LEN,), dtype=tf.int32)
    types_in = tf.keras.layers.Input((LEN,), dtype=tf.int32)

    # Longest non-padded length in the batch; everything is cut to that width.
    pad_flags = tf.cast(tf.equal(ids_in, 1), tf.int32)
    cur_len = tf.reduce_max(LEN - tf.reduce_sum(pad_flags, -1))

    trimmed_ids = ids_in[:, :cur_len]
    trimmed_mask = mask_in[:, :cur_len]
    trimmed_types = types_in[:, :cur_len]

    encoder = TFRobertaModel.from_pretrained(path_pretrained, config=config)
    encoded = encoder(
        trimmed_ids,
        attention_mask=trimmed_mask,
        token_type_ids=trimmed_types
    )
    hidden = encoded[0]

    def span_head(features):
        # One independent head: per-token score, then softmax across tokens.
        out = tf.keras.layers.Dropout(0.1)(features)
        out = tf.keras.layers.Conv1D(768, 2, padding='same')(out)
        out = tf.keras.layers.LeakyReLU()(out)
        out = tf.keras.layers.Dense(1)(out)
        out = tf.keras.layers.Flatten()(out)
        return tf.keras.layers.Activation('softmax')(out)

    start_probs = span_head(hidden)
    end_probs = span_head(hidden)

    model_inputs = [ids_in, mask_in, types_in]

    new_model = tf.keras.models.Model(inputs=model_inputs, outputs=[start_probs, end_probs])
    new_model.compile(loss=cal_loss, optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5))

    # Same graph, but outputs are zero-padded on the right back to LEN columns.
    right_pad = [[0, 0], [0, LEN - cur_len]]
    new_pad_model = tf.keras.models.Model(
        inputs=model_inputs,
        outputs=[tf.pad(start_probs, right_pad, constant_values=0.),
                 tf.pad(end_probs, right_pad, constant_values=0.)],
    )
    return new_model, new_pad_model

In [6]:
# Fill the module-level id / mask / label arrays in place: training rows first,
# then test rows. Neither call returns anything.
convertTrainData()
convertTestData()

In [7]:
# Cross-validated training: one freshly built model per stratified fold.
# Each fold is trained for EPOCHS epochs, then used to predict its held-out
# rows (out-of-fold) and the full test set, and scored with word-level Jaccard.
rets = []  # mean validation Jaccard of each fold

# Out-of-fold start/end probabilities for the training rows.
pstart = np.zeros((in_ids.shape[0],LEN))
pend = np.zeros((in_ids.shape[0],LEN))

# Test-set start/end probabilities, averaged over the folds.
pSsum = np.zeros((in_ids_t.shape[0],LEN))
pEsum = np.zeros((in_ids_t.shape[0],LEN))
histories = []  # one Keras History per (fold, epoch), in training order

for curFold, (idx1, idx2) in enumerate(strat.split(in_ids,train.sentiment.values)):
    print("Fold " + str(curFold+1) + ":")
    # Despite the .h5 suffix this file is a pickle written by dump_data().
    fname = "fold" + str(curFold+1) + ".h5"

    # `kfold` is the notebook's alias for tensorflow.keras.backend.
    kfold.clear_session()
    model, pad_model = build_model()

    inp1 = [in_ids[idx1,], mask[idx1,], types[idx1,]]
    target1 = [start[idx1,], end[idx1,]]

    inp2 = [in_ids[idx2,], mask[idx2,], types[idx2,]]
    target2 = [start[idx2,], end[idx2,]]

    # Validation rows ordered by padding count, most-padded first, so each
    # batch holds rows of similar length.
    sortedData2 = np.int32(sorted(range(len(inp2[0])),
                               key=lambda i: (inp2[0][i] == 1).sum(),
                               reverse=True))

    inp2 = [arr[sortedData2] for arr in inp2]
    target2 = [arr[sortedData2] for arr in target2]

    for epo in range(1, EPOCHS + 1):
        # Sort by padding count with a small random jitter (-3..2), cut into
        # batches, then shuffle the batches as whole units.
        sortedData1 = np.int32(sorted(range(len(inp1[0])),
                                   key=lambda i: (inp1[0][i] == 1).sum() + np.random.randint(-3, 3),
                                   reverse=True))
        idxs = np.random.permutation(math.ceil(len(sortedData1) / BATCH_SIZE))
        sortedData1 = np.concatenate(
            [sortedData1[idx * BATCH_SIZE: (idx + 1) * BATCH_SIZE] for idx in idxs]
        )

        inp1 = [arr[sortedData1] for arr in inp1]
        target1 = [arr[sortedData1] for arr in target1]

        # Exactly one epoch per call; shuffle=False keeps the batches built above.
        histories.append(model.fit(inp1, target1, epochs=epo, initial_epoch=epo - 1,
                  batch_size=BATCH_SIZE, verbose=1, callbacks=[],
                  validation_data=(inp2, target2), shuffle=False))
        dump_data(model, fname)

    # Reload the weights saved after the last epoch.
    load_data(model, fname)
    pstart[idx2,], pend[idx2,] = pad_model.predict([in_ids[idx2,], mask[idx2,], types[idx2,]], verbose=1)

    pSE = pad_model.predict([in_ids_t, mask_t, types_t], verbose=1)
    pSsum += pSE[0]/strat.n_splits
    pEsum += pSE[1]/strat.n_splits

    jacs = []
    for idxCur in idx2:
        maxS = np.argmax(pstart[idxCur,])
        maxE = np.argmax(pend[idxCur,])
        # Positions 0 and 1 hold the two prefix tokens, so text token k sits at
        # position k + 2. Fall back to the whole tweet when the span is
        # inverted or ends on a prefix token; otherwise clamp the start at 0 so
        # the slice never wraps around to the end of the tweet.
        if maxS > maxE or maxE < 2:
            st = train.loc[idxCur,'text']
        else:
            enc = tknzer.encode(" "+" ".join(train.loc[idxCur,'text'].split()))
            st = tknzer.decode(enc.ids[max(maxS - 2, 0): maxE - 1])
        jacs.append(jaccard_idx(st, train.loc[idxCur, 'selected_text']))
    rets.append(np.mean(jacs))
    print("Fold " + str(curFold+1) + " Jaccard = " + str(np.mean(jacs)) + "\n")

In [8]:
# Average of the per-fold validation Jaccard scores collected in `rets`.
print('Mean Jaccard of Folds = ',np.mean(rets))

In [9]:
# Individual validation Jaccard score of each fold, in fold order.
print(rets)

In [16]:

# Plot training vs. validation loss per fold.
# `histories` holds one History per (fold, epoch) in training order, so every
# EPOCHS consecutive entries belong to the same fold.
show = []   # per fold: training-loss entries, one per epoch
show2 = []  # per fold: validation-loss entries, one per epoch
one = []
one2 = []

for his in histories:
    one.append(his.history['loss'])
    one2.append(his.history['val_loss'])
    if len(one) >= EPOCHS:
        show.append(one)
        show2.append(one2)
        one = []
        one2 = []

for fold_no, (train_curve, val_curve) in enumerate(zip(show, show2), start=1):
    fig, ax = plt.subplots()
    ax.plot(train_curve)
    ax.plot(val_curve)
    ax.set_title("Fold " + str(fold_no) + ": model loss")
    ax.set_ylabel('loss')
    ax.set_xlabel('epoch')
    # The second curve is the loss on the fold's held-out validation rows.
    ax.legend(['train', 'test'], loc='upper left')
    plt.show()