In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Shell escape: print the kernel's current working directory.
!pwd

/home/adam/R/Yelp


In [3]:
# Directory holding the CSVs and the GloVe vectors.
# Defaults to the original location; set YELP_DATASET_DIR to run elsewhere
# without editing the notebook.
DATA_DIR = os.environ.get('YELP_DATASET_DIR', '/home/adam/R/Yelp/dataset')

# Input files, all resolved relative to DATA_DIR.
TRAIN_DATA_FILE = f'{DATA_DIR}/train.csv'
EMBEDDING_FILE = f'{DATA_DIR}/glove.6B.50d.txt'
TEST_DATA_FILE = f'{DATA_DIR}/test.csv'

In [4]:
# config parameters
embed_size = 50 # how big is each word vector
max_features = 20000 # how many unique words to use (i.e. num rows in embedding vector)
maxlen = 100 # max number of words in a comment to use

In [5]:
# Read the training CSV into a DataFrame.
train = pd.read_csv(filepath_or_buffer=TRAIN_DATA_FILE)

In [6]:
# Read the test CSV into a DataFrame.
test = pd.read_csv(filepath_or_buffer=TEST_DATA_FILE)

In [7]:
# Training comment texts as an array; missing comments become the "_na_" placeholder.
list_sentences_train = (
    train.loc[:, 'comment_text']
    .fillna(value="_na_")
    .values
)

In [8]:
# Target columns of the training frame.
list_classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
# Target matrix: one row per training comment, one column per entry of list_classes.
y = train.loc[:, list_classes].values

In [9]:
# Test comment texts as an array; missing comments become the "_na_" placeholder.
list_sentences_test = (
    test.loc[:, 'comment_text']
    .fillna(value="_na_")
    .values
)

In [13]:
# Eyeball the raw training texts (display only; nothing is assigned).
list_sentences_train

array(["Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
       "D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)",
       "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",
       ...,
       'Spitzer \n\nUmm, theres no actual article for prostitution ring.  - Crunch Captain.',
       'And it looks like it was actually you who put on the speedy to have the first version deleted now that I look at it.',
       '"\nAnd ... I really don\'t think you understand.  I came here and my idea was bad right away.  What kind of communit

In [10]:
# Tokenizer whose vocabulary is capped via num_words=max_features.
tokenizer = Tokenizer(num_words=max_features)

In [11]:
# Build the vocabulary from the training comments only.
tokenizer.fit_on_texts(texts=list_sentences_train.tolist())

In [12]:
# Encode each training comment as a list of integer word indices.
list_tokenized_train = tokenizer.texts_to_sequences(texts=list_sentences_train)

In [13]:
# Encode each test comment with the tokenizer fitted on the training texts.
list_tokenized_test = tokenizer.texts_to_sequences(texts=list_sentences_test)

In [14]:
# Bring every token sequence to exactly `maxlen` entries (train first, then test).
X_t, X_te = (
    pad_sequences(token_lists, maxlen=maxlen)
    for token_lists in (list_tokenized_train, list_tokenized_test)
)

In [15]:
def get_coefs(word, *arr):
    """Pair a token with its embedding coefficients.

    Args:
        word: the token (first field of an embedding-file line).
        *arr: the remaining fields, convertible to float.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array.
    """
    coefficients = np.asarray(arr, dtype=np.float32)
    return word, coefficients

In [16]:
# Parse the embedding text file into {word: float32 vector}.
# The file is opened in a `with` block so the handle is closed once parsing
# finishes, and it is decoded explicitly as UTF-8 instead of relying on the
# platform's default encoding.
with open(EMBEDDING_FILE, encoding='utf8') as embedding_file:
    embeddings_index = dict(
        get_coefs(*line.strip().split()) for line in embedding_file
    )

In [17]:
# Stack every pretrained vector into one 2-D array.
# np.stack expects a real sequence of arrays; a dict view is deprecated input
# and is rejected by newer NumPy releases, so materialise it as a list first.
all_embs = np.stack(list(embeddings_index.values()))

In [18]:
# Global mean and standard deviation over all pretrained coefficients;
# used to draw the random initial embedding matrix.
emb_mean = all_embs.mean()
emb_std = all_embs.std()

In [19]:
# Word -> integer index mapping taken from the fitted tokenizer.
word_index = tokenizer.word_index

In [20]:
# Number of rows of the embedding matrix.
# Keras word indices start at 1 (index 0 is what pad_sequences fills with),
# so a vocabulary of N words needs N + 1 rows; the count stays capped at
# max_features.
nb_words = min(max_features, len(word_index) + 1)

In [21]:
# Start from random vectors drawn with the same mean / std as the pretrained
# embeddings; rows are later overwritten where a pretrained vector exists.
embedding_matrix = np.random.normal(
    loc=emb_mean,
    scale=emb_std,
    size=(nb_words, embed_size),
)

In [22]:
# Overwrite the random rows with pretrained vectors where one exists.
# The bound is the matrix's real row count (never more than max_features):
# comparing against max_features alone raises IndexError when the vocabulary
# is smaller than max_features and the matrix therefore has fewer rows.
row_limit = min(max_features, embedding_matrix.shape[0])
for word, i in word_index.items():
    if i >= row_limit:
        # No row exists for this index; leave it out.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words missing from the pretrained file keep their random vector.
        embedding_matrix[i] = embedding_vector

In [39]:
# Inspect the filled embedding matrix (display only; nothing is assigned).
embedding_matrix

array([[ 0.86715237,  0.99219564, -0.43841532, ...,  0.68932183,
         0.17521274,  0.1799429 ],
       [ 0.41800001,  0.24968   , -0.41242   , ..., -0.18411   ,
        -0.11514   , -0.78580999],
       [ 0.68046999, -0.039263  ,  0.30186   , ..., -0.073297  ,
        -0.064699  , -0.26043999],
       ...,
       [-0.13491   , -0.8635    , -0.033898  , ...,  0.29484999,
        -0.24315999,  0.81682003],
       [ 0.1178    ,  0.14624   , -0.28240001, ..., -0.19529   ,
        -0.13610999,  1.04270005],
       [-0.64388001, -0.54152   ,  0.10305   , ..., -0.06732   ,
        -0.73308003,  0.88625002]])

In [23]:
# Bidirectional-LSTM classifier over padded token sequences.
inp = Input(shape=(maxlen,))
# input_dim is taken from the weight matrix itself so the layer and its
# initial weights can never disagree on the number of rows.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
# Collapse the per-timestep outputs into one vector per comment.
x = GlobalMaxPool1D()(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.1)(x)
# One sigmoid unit per target column, so the width follows list_classes.
x = Dense(len(list_classes), activation='sigmoid')(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [24]:
# Train for two epochs, holding out the last 10% of the rows for validation.
model.fit(
    x=X_t,
    y=y,
    batch_size=512,
    epochs=2,
    validation_split=0.1,
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fae133a6c18>

In [25]:
# Model outputs for the test comments (despite the name, these are predictions).
y_test = model.predict(x=[X_te], batch_size=1024, verbose=1)



In [30]:
# Model outputs on the training comments, for comparison against `y`.
tr_fitted = model.predict(x=[X_t], batch_size=1024, verbose=1)



In [40]:
# Predicted scores for training row 1253 (one value per entry of list_classes).
tr_fitted[1253]

array([0.83204603, 0.03617616, 0.37597415, 0.02444025, 0.34808108,
       0.05294542], dtype=float32)

In [37]:
# Padded token ids of one training row.
# NOTE(review): this looks at row 1232 while the neighbouring cells inspect
# row 1253 — confirm which index was intended.
X_t[1232]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
          73,    42,    79,     3,  1231,     8,    41,   134,   167,
         240,    21,     1,  3422,    25,    39,     1,  1194,   231,
         578,   141,    12,    35,   305,  3409,   117,  3841,    29,
        1409,     3, 19538,    24,   210,     5,  2157,     2,  1980,
          21], dtype=int32)

In [39]:
# True labels for training rows 1252-1261, to compare with the predictions above.
y[1252:1262]

array([[0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])