In [34]:
#https://pypi.org/project/keras-self-attention/
import pandas as pd
import numpy as np

from tensorflow import keras
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Flatten, Dense

from tensorflow.keras.models import Sequential
from keras_self_attention import SeqSelfAttention

In [7]:
df = pd.read_csv('./train.csv')

In [None]:
df.head()

In [8]:
trainig_sample = df.sample(100000, random_state=0)
X_train = trainig_sample['comment_text'].astype(str)
X_train = X_train.fillna('DUMMY')
y_train = trainig_sample['target']
y_train = y_train.apply(lambda x: 1 if x > 0.5 else 0)

In [10]:
def get_seqs(text):
    sequences = tokenizer.texts_to_sequences(text)
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')
    return padded_sequences

In [9]:
epochs = 2
max_num_words = 20000
max_length = 128
tokenizer = Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 89735 unique tokens.


In [None]:
X_train = get_seqs(X_train)

In [20]:
model = Sequential()
model.add(Embedding(max_num_words, 100, input_length=max_length))
model.add(Bidirectional(LSTM(units=128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
model.add(SeqSelfAttention(attention_activation='sigmoid'))
model.add(Bidirectional(LSTM(units=64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
model.add(SeqSelfAttention(attention_activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))

In [21]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 128, 100)          2000000   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 128, 256)          234496    
_________________________________________________________________
seq_self_attention_4 (SeqSel (None, None, 256)         16449     
_________________________________________________________________
bidirectional_7 (Bidirection (None, None, 128)         164352    
_________________________________________________________________
seq_self_attention_5 (SeqSel (None, None, 128)         8257      
_________________________________________________________________
dense_4 (Dense)              (None, None, 1)           129       
Total params: 2,423,683
Trainable params: 2,423,683
Non-trainable params: 0
____________________________________________

In [22]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [23]:
model.fit(X_train, y_train, epochs=epochs)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x289f6097e80>

In [24]:
validation_sample = df.sample(500, random_state=42)
X_val = validation_sample['comment_text'].astype(str)
X_val = X_val.fillna('DUMMY')
y_val = validation_sample['target']
y_val = y_val.apply(lambda x: 1 if x > 0.5 else 0)

In [25]:
loss, accuracy = model.evaluate(get_seqs(X_val), y_val)
print('Evaluation accuracy: {0}'.format(accuracy))

Evaluation accuracy: 0.9660000205039978


In [26]:
test = pd.read_csv('./test.csv')

In [27]:
X_test = test['comment_text'].astype(str)
X_test = X_test.fillna('DUMMY')

In [28]:
probs = model.predict(get_seqs(X_test), verbose=1)



In [29]:
probs = [x[0] for x in probs]

In [30]:
probs

[array([0.01149434], dtype=float32),
 array([0.0076099], dtype=float32),
 array([0.05558798], dtype=float32),
 array([0.00464401], dtype=float32),
 array([0.0032472], dtype=float32),
 array([0.00287113], dtype=float32),
 array([0.90897447], dtype=float32),
 array([0.17860287], dtype=float32),
 array([0.01280928], dtype=float32),
 array([0.01069167], dtype=float32),
 array([0.00590619], dtype=float32),
 array([0.01839778], dtype=float32),
 array([0.02178353], dtype=float32),
 array([0.01666182], dtype=float32),
 array([0.01620874], dtype=float32),
 array([0.01763019], dtype=float32),
 array([0.01866388], dtype=float32),
 array([0.00272807], dtype=float32),
 array([0.02234602], dtype=float32),
 array([0.59472203], dtype=float32),
 array([0.00873965], dtype=float32),
 array([0.1651375], dtype=float32),
 array([0.00640672], dtype=float32),
 array([0.00263995], dtype=float32),
 array([0.01524752], dtype=float32),
 array([0.02915969], dtype=float32),
 array([0.2838744], dtype=float32),
 arra

In [31]:
model.save("attention_md.h5")

In [18]:
submission = pd.DataFrame(test['id']).reset_index(drop=True)
submission['prediction'] = pd.Series(probs, name='prediction')
submission.to_csv('submission.csv', index=False)