In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers as L
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
# Shared size constant. In this notebook it plays two roles: it is the `maxlen`
# every token sequence is padded/truncated to, and it is also the output width
# of both Embedding layers.
EMBED_DIM = 192

In [2]:
# Load the training records, keep only the three columns used downstream and
# expose the annotated span under the name `selected_text`.
df = (
    pd.read_json('../input/sentimentextraction/train.json')
    [['text', 'important_span_text', 'sentiment']]
    .rename(columns={'important_span_text': 'selected_text'})
)

# Word-level vocabulary is learned from the full texts only; the answer spans
# are encoded with the same tokenizer so both share one index space.
context_tokenizer = Tokenizer()
context_tokenizer.fit_on_texts(df.text.fillna(''))
context = context_tokenizer.texts_to_sequences(df.text.fillna(''))
answers = context_tokenizer.texts_to_sequences(df.selected_text.fillna(''))


def _span_labels(tokens, span):
    """Return per-token (begin, end) indicator lists for `span` inside `tokens`.

    begin[x] is 1 where an occurrence of `span` starts at token x, and
    end[y] is 1 where that occurrence's last token sits (y = x + len(span) - 1).
    Every occurrence is marked. An empty `span` yields all-zero labels, because
    an empty slice would otherwise compare equal at every position.
    """
    n_tokens, n_span = len(tokens), len(span)
    begin = [0] * n_tokens
    end = [0] * n_tokens
    if n_span == 0:
        return begin, end
    for x in range(n_tokens - n_span + 1):
        if tokens[x:x + n_span] == span:
            begin[x] = 1
            end[x + n_span - 1] = 1
    return begin, end


span_labels = [_span_labels(a, b) for a, b in zip(context, answers)]
beg_pos = [begin for begin, _ in span_labels]
end_pos = [end for _, end in span_labels]

# Bring token ids and both label sequences to a common fixed length
# (zero-padded / cut at the tail).
context, beg_pos, end_pos = (
    np.array(pad_sequences(seqs, maxlen=EMBED_DIM, padding='post', truncating='post'))
    for seqs in (context, beg_pos, end_pos)
)

# Rows whose begin-label row contains no 1 at all are flagged here and removed;
# the same mask is reused later for the question array.
all_zero = ~beg_pos.any(axis=1)

context, beg_pos, end_pos = (arr[~all_zero] for arr in (context, beg_pos, end_pos))

# Add a trailing channel axis to each label array and join them, so the last
# axis of `ans_vec` is [begin, end].
beg_pos = beg_pos[:, :, np.newaxis]
end_pos = end_pos[:, :, np.newaxis]
ans_vec = np.concatenate([beg_pos, end_pos], axis=-1)

# The sentiment label acts as the "question": it gets its own tokenizer, is
# padded to the same length as the context and filtered with the same row mask.
question_tokenizer = Tokenizer()
question_tokenizer.fit_on_texts(df['sentiment'].fillna(''))
question = np.array(
    pad_sequences(
        question_tokenizer.texts_to_sequences(df['sentiment'].fillna('')),
        maxlen=EMBED_DIM,
        padding='post',
        truncating='post',
    )
)[~all_zero]

# 80/20 train/validation split; the fixed random_state keeps the partition
# reproducible across runs.
(
    context_train, context_valid,
    question_train, question_valid,
    ans_vec_train, ans_vec_valid,
) = train_test_split(context, question, ans_vec, test_size=0.2, random_state=0)

# Training

In [3]:
# Number of units in each GRU encoder.
N_REC = 150

# Two integer-id inputs, both padded to EMBED_DIM tokens.
context_inp = L.Input(shape=(EMBED_DIM, ), name='context')
question_inp = L.Input(shape=(EMBED_DIM, ), name='question')

# Separate embedding tables; the +1 accounts for index 0, which the padding uses.
context_emb = L.Embedding(len(context_tokenizer.word_index)+1, EMBED_DIM, name='context_embeddings')(context_inp)
question_emb = L.Embedding(len(question_tokenizer.word_index)+1, EMBED_DIM, name='question_embeddings')(question_inp)

# return_sequences=True keeps one hidden state per token so the two encodings
# can be joined position by position.
context_emb = L.GRU(N_REC, return_sequences=True, name='context_gru')(context_emb)
question_emb = L.GRU(N_REC, return_sequences=True, name='question_gru')(question_emb)

concat_emb = L.Concatenate(axis=-1, name='concatenate')([context_emb, question_emb])

# Per-token head with two independent sigmoid scores: [is_begin, is_end].
outputs = L.Dense(2, activation='sigmoid', name='outputs')(concat_emb)

model = keras.Model(inputs=[context_inp, question_inp], outputs=outputs)
# The targets are independent 0/1 indicators per token and per channel, so the
# matching loss is element-wise binary cross-entropy. Categorical cross-entropy
# would normalise across the two channels and contribute zero loss at every
# position whose target is [0, 0], leaving almost all tokens untrained.
model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(3e-5))

In [4]:
# Stop once validation loss has not improved by at least 1e-4 for 5 epochs and
# roll the model back to the best weights seen.
es = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=1e-4,
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# 200 is only an upper bound on epochs; early stopping normally ends the run.
history = model.fit(
    x=[context_train, question_train],
    y=ans_vec_train,
    validation_data=([context_valid, question_valid], ans_vec_valid),
    epochs=200,
    callbacks=[es],
)

# Inference

In [None]:
# Read the inference set and turn the sentiment column into the words the
# question tokenizer is fed with: value 1 -> "positive", anything else -> "negative".
test = pd.read_csv('../input/sentimentextraction/sentimentdone.csv')[['text', 'sentiment']]
test["sentiment"] = test["sentiment"].eq(1).map({True: "positive", False: "negative"})

# Encode and pad exactly as was done for the training data.
contexts = np.array(
    pad_sequences(
        context_tokenizer.texts_to_sequences(test["text"].fillna('')),
        maxlen=EMBED_DIM,
        padding='post',
        truncating='post',
    )
)
questions = np.array(
    pad_sequences(
        question_tokenizer.texts_to_sequences(test["sentiment"].fillna('')),
        maxlen=EMBED_DIM,
        padding='post',
        truncating='post',
    )
)

In [None]:
# Predict one span per test row and record it as character offsets.
# The offsets index into the text rebuilt from the row's token ids (words joined
# by single spaces), not into the raw input string.
ids = []
starts = []
ends = []
for i in tqdm(range(contexts.shape[0])):
    # Slicing i:i+1 keeps the leading batch axis the model expects.
    query_context = contexts[i:i+1]
    query_question = questions[i:i+1]
    token_ids = query_context[0]

    # Model output is (1, seq_len, 2); argmax over the token axis gives the
    # highest-scoring begin index and end index.
    pred_ans_beg, pred_ans_end = np.ravel(model([query_context, query_question]).numpy().argmax(axis=1))

    # Decode the tokens before the span and the span itself. Padding ids (0)
    # have no word and are skipped by the tokenizer when decoding.
    prefix = context_tokenizer.sequences_to_texts([token_ids[:pred_ans_beg]])[0]
    ans = context_tokenizer.sequences_to_texts([token_ids[pred_ans_beg: pred_ans_end+1]])[0]

    # Derive the start offset from the prefix length instead of searching for
    # `ans` in the text: a substring search returns the first match, which may
    # lie inside an earlier word or at an earlier repetition of the same words.
    if ans and prefix:
        start = len(prefix) + 1  # +1 for the space separating prefix and span
    else:
        # Span starts the text, or the prediction is empty (end before begin,
        # or entirely in padding); an empty prediction is reported as (0, 0).
        start = 0

    ids.append(i)
    starts.append(start)
    ends.append(start + len(ans))

In [None]:
# Collect the predictions into a table (columns: id, start, end) and write it
# out without the DataFrame index.
sub = pd.DataFrame(dict(id=ids, start=starts, end=ends))
sub.to_csv("submission.csv", index=False)