# Let's load model weights

In [4]:
!pip install keras



In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Input, Dense, CuDNNGRU,concatenate, Bidirectional, SpatialDropout1D, Conv1D, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.optimizers import RMSprop, Adam
from keras.models import Model
from keras.callbacks import EarlyStopping

In [6]:
import pickle

# Was only used as the num_words argument of a placeholder Tokenizer that this
# cell built and immediately overwrote; kept in case other cells reference it.
MAX_NUM_WORDS = 100000

# The tokenizer is restored as-is from disk, so no fresh Tokenizer(...) needs
# to be constructed first (the previous placeholder instance was discarded).
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load a tokenizer file you created yourself or otherwise trust.
with open('./tokenizer.gru.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [7]:
import pandas as pd

# Read the stored embedding matrix into a DataFrame; a later cell converts it
# to the NumPy array passed as the Embedding layer's initial weights.
embedding_matrix = pd.read_csv('./embedding_matrix.gru.csv')

In [8]:
import numpy as np

# Keep columns at positions 1..499 of every row (position 0 is skipped) and
# convert them to a 2-D NumPy array in a single vectorized step. This replaces
# a Python-level iterrows() loop that built one Series per row, which is very
# slow for a matrix with hundreds of thousands of rows.
# The 500 upper bound is carried over unchanged; with the 300 value columns
# shown in the output below it does not truncate anything.
# NOTE(review): column 0 is presumably the index written when the CSV was
# saved — confirm against the code that produced the file.
embedding_matrix_numpy = embedding_matrix.iloc[:, 1:500].to_numpy()
embedding_matrix_numpy.shape

(301222, 300)

In [12]:
# Length of every model input sequence; also used as `maxlen` when inputs are
# padded in pad_text.
MAX_SEQUENCE_LENGTH=256
# Width of one embedding vector; matches the second dimension of
# embedding_matrix_numpy shown above, (301222, 300).
EMBEDDINGS_DIMENSION = 300

# Input: a batch of integer token-id sequences of fixed length.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# Embedding layer initialised from the loaded matrix and frozen
# (trainable=False). The vocabulary size is taken from the tokenizer, so
# len(tokenizer.word_index) + 1 must equal the row count of the matrix.
embedding_layer = Embedding(len(tokenizer.word_index) + 1,
                            EMBEDDINGS_DIMENSION,
                            weights=[embedding_matrix_numpy],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
x = embedding_layer(sequence_input)
x = SpatialDropout1D(0.2)(x)
# Bidirectional GRU with 64 units per direction, returning the full sequence
# (the summary below shows an output of (None, 256, 128)).
# NOTE(review): CuDNNGRU presumably requires a CUDA GPU at runtime — confirm.
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)   
# 1-D convolution over the GRU outputs: 64 filters, window of 2, no padding.
x = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(x)

# Collapse the sequence axis two ways (mean and max) and join both summaries.
avg_pool1 = GlobalAveragePooling1D()(x)
max_pool1 = GlobalMaxPooling1D()(x)     

x = concatenate([avg_pool1, max_pool1])

# Single sigmoid unit: one score in [0, 1] per input sequence.
preds = Dense(1, activation='sigmoid')(x)


# The layer creation order above is kept as-is, since the weights file loaded
# later has to line up with this architecture.
model = Model(sequence_input, preds)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 256)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 256, 300)     90366600    input_2[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_2 (SpatialDro (None, 256, 300)     0           embedding_2[0][0]                
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 256, 128)     140544      spatial_dropout1d_2[0][0]        
__________________________________________________________________________________________________
conv1d_2 (

In [13]:
# Load the weights stored in ./gru.h5 into the `model` built in the previous cell.
model.load_weights('./gru.h5')

## Testing

In [33]:
def pad_text(texts, tokenizer):
    """Turn raw texts into fixed-length token-id sequences.

    Args:
        texts: iterable of strings to encode.
        tokenizer: object exposing ``texts_to_sequences`` (here, the loaded
            tokenizer).

    Returns:
        The result of ``pad_sequences`` applied to the tokenized texts with
        ``maxlen=MAX_SEQUENCE_LENGTH``.
    """
    token_id_sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(token_id_sequences, maxlen=MAX_SEQUENCE_LENGTH)
def predictText(text):
    """Score a single text with the loaded model.

    The text is wrapped in a one-element Series, tokenized and padded with
    ``pad_text`` using the module-level ``tokenizer``, and passed to
    ``model.predict``.

    Args:
        text: the string to score.

    Returns:
        The first value of the first row of the prediction output, i.e. the
        output of the model's single sigmoid unit for this text.
    """
    single_item_batch = pd.Series([text])
    padded_batch = pad_text(single_item_batch, tokenizer)
    scores = model.predict(padded_batch)
    return scores[0][0]

#### Let's test the model on test data

The main challenge was to make the model react to more than just swear words. For swear words alone a simple biased model would be enough, but NLP is more complex than that.

In [2]:
predictText("I don't like  people")

0.5973547


In [3]:
predictText("I don't like this movie")

0.034131106


Above we can see that the model does not react only to words like "hate" and "don't like", but also to the context of the sentence.

In [154]:
# Evaluate on a fixed window of the test set: rows at positions 500..999.
# .copy() makes the slice an independent DataFrame, so adding the
# "prediction" column in a later cell does not raise SettingWithCopyWarning.
test = pd.read_csv("./data/test.csv").iloc[500:1000].copy()

In [161]:
# Score every comment with ONE batched predict call instead of one
# model.predict call per row via iterrows(). pad_text tokenizes and pads the
# whole column; the model output has one column, so [:, 0] flattens it to one
# score per row. Values may differ from per-row prediction only by
# floating-point rounding.
test["prediction"] = model.predict(pad_text(test["comment_text"], tokenizer))[:, 0]

In [163]:
# Keep only the rows whose predicted score is strictly above 0.5.
toxic_test = test.loc[test['prediction'].gt(0.5)]

In [165]:
len(toxic_test)

17

17 of the 500 comments were detected as toxic.

Most of them were detected because of swear words or personal insults.

In [None]:
# Build the list of toxic comment texts directly from `test` rather than from
# the previous value of `toxic_test`. The old form rebound the name from a
# DataFrame to a list using itself, so running the cell a second time failed
# (a list cannot be indexed with 'comment_text').
toxic_test = list(test.loc[test['prediction'] > 0.5, 'comment_text'])

In [168]:
toxic_test[1]

"The NDP claims that they will eliminate corporate and union funding from BC Politics.  Until that happens, they'll take contributions from anywhere they can, including a foreign union.  Hypocrites."

In [169]:
toxic_test[2]

'What fools these Progressive Statist Mutts are.'

The two examples above are pretty easy: they contain swear words and the text is short. But below we have a long text. A simple model would be fooled by the good words around the insult.

In [170]:
toxic_test[3]

'Even a liberal idiot can understand the law:  “Whenever the President finds that the entry of any aliens or of any class of aliens into the United States would be detrimental to the interests of the United States, he may by proclamation, and for such period as he shall deem necessary, suspend the entry of all aliens or any class of aliens as immigrants or nonimmigrants, or impose on the entry of aliens any restrictions he may deem to be appropriate.”  Then again maybe not if a liberal judge is the smartest you have.'