In [1]:
import pandas as pd
import numpy as np
np.random.seed(0)
from gensim.models import Word2Vec
from nltk import word_tokenize
import xgboost as xgb

In [2]:
raw_df = pd.read_json('news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset.json', lines=True)

In [3]:
raw_df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [4]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26709 entries, 0 to 26708
Data columns (total 3 columns):
article_link    26709 non-null object
headline        26709 non-null object
is_sarcastic    26709 non-null int64
dtypes: int64(1), object(2)
memory usage: 626.1+ KB


In [5]:
raw_df = raw_df.drop(raw_df.columns[0], axis=1)

In [6]:
raw_df.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [7]:
target = raw_df.is_sarcastic

In [8]:
df = raw_df.headline
data = df.map(word_tokenize).values

In [9]:
data[:5]

array([list(['former', 'versace', 'store', 'clerk', 'sues', 'over', 'secret', "'black", 'code', "'", 'for', 'minority', 'shoppers']),
       list(['the', "'roseanne", "'", 'revival', 'catches', 'up', 'to', 'our', 'thorny', 'political', 'mood', ',', 'for', 'better', 'and', 'worse']),
       list(['mom', 'starting', 'to', 'fear', 'son', "'s", 'web', 'series', 'closest', 'thing', 'she', 'will', 'have', 'to', 'grandchild']),
       list(['boehner', 'just', 'wants', 'wife', 'to', 'listen', ',', 'not', 'come', 'up', 'with', 'alternative', 'debt-reduction', 'ideas']),
       list(['j.k.', 'rowling', 'wishes', 'snape', 'happy', 'birthday', 'in', 'the', 'most', 'magical', 'way'])],
      dtype=object)

In [10]:
total_vocabulary = set(word for headline in data for word in headline)

In [11]:
len(total_vocabulary)
print("There are {} unique tokens in our dataset.".format(len(total_vocabulary)))

There are 29291 unique tokens in our dataset.


In [12]:
glove = {}
with open('glove.6B.50d.txt', 'rb') as f:
    for line in f:
        parts = line.split()
        word = parts[0].decode('utf-8')
        if word in total_vocabulary:
            vector = np.array(parts[1:], dtype=np.float32)
            glove[word] = vector

In [13]:
glove['versace']

array([ 1.1284   ,  0.36283  , -0.30177  , -0.20127  ,  0.27707  ,
        0.052181 , -0.39466  , -1.4702   , -0.84226  ,  0.69612  ,
        0.0069658, -0.85032  , -1.4427   , -0.37752  ,  1.0227   ,
        0.014541 , -1.1521   , -0.2025   ,  0.44106  ,  0.044921 ,
       -0.49957  ,  0.441    , -0.57663  ,  0.079393 , -1.4346   ,
       -0.57216  , -1.6706   ,  0.83452  ,  0.25863  , -0.57889  ,
       -0.43723  ,  1.0388   , -0.33706  , -0.16978  ,  0.083735 ,
        0.018831 , -0.11455  ,  1.1945   , -0.36766  ,  0.27595  ,
        0.19627  ,  0.95083  , -0.15675  , -0.53886  ,  0.571    ,
       -0.48963  , -0.046339 , -1.2631   , -0.062511 ,  0.53633  ],
      dtype=float32)

In [14]:
class W2vVectorizer(object):
    
    def __init__(self, w2v):
        
        self.w2v = w2v
        if len(w2v) == 0:
            self.dimensions = 0
        else:
            self.dimensions = len(w2v[next(iter(glove))])
    
    
    def fit(self, X, y):
        return self
            
    def transform(self, X):
        return np.array([
            np.mean([self.w2v[w] for w in words if w in self.w2v]
                   or [np.zeros(self.dimensions)], axis=0) for words in X])

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

rf =  Pipeline([("Word2Vec Vectorizer", W2vVectorizer(glove)),
              ("Random Forest", RandomForestClassifier(n_estimators=100, verbose=True))])
svc = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(glove)),
                ('Support Vector Machine', SVC())])
lr = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(glove)),
              ('Logistic Regression', LogisticRegression())])

In [16]:
models = [('Random Forest', rf),
          ("Support Vector Machine", svc),
          ("Logistic Regression", lr)]

In [17]:
scores = [(name, cross_val_score(model, data, target, cv=2).mean()) for name, model, in models]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    8.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    8.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished


In [19]:
scores

[('Random Forest', 0.7341345876452336),
 ('Support Vector Machine', 0.7207683416425245),
 ('Logistic Regression', 0.7087873810569283)]

In [20]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [18]:
df.head()

0    former versace store clerk sues over secret 'b...
1    the 'roseanne' revival catches up to our thorn...
2    mom starting to fear son's web series closest ...
3    boehner just wants wife to listen, not come up...
4    j.k. rowling wishes snape happy birthday in th...
Name: headline, dtype: object

In [21]:
tokenizer = text.Tokenizer(num_words=20000)
tokenizer.fit_on_texts(list(df))
list_tokenized_headlines = tokenizer.texts_to_sequences(df)

In [22]:
list_tokenized_headlines

[[307, 15114, 678, 3336, 2297, 47, 381, 2575, 15115, 5, 2576, 8433],
 [3, 8434, 3337, 2745, 21, 1, 165, 8435, 415, 3111, 5, 257, 8, 1001],
 [144, 837, 1, 906, 1748, 2092, 581, 4718, 220, 142, 38, 45, 1, 10735],
 [1484, 35, 223, 399, 1, 1831, 28, 318, 21, 9, 2923, 1392, 6968, 967],
 [766, 718, 4719, 907, 10736, 622, 593, 4, 3, 94, 1308, 91],
 [10737, 3, 364, 72],
 [3, 6969, 350, 5, 460, 4273, 2194, 1485],
 [18, 478, 38, 1167, 30, 154, 1, 98, 82, 17, 157, 5, 31, 351],
 [248, 3622, 6970, 554, 5273, 1994, 140],
 [2093, 325, 346, 400, 59, 15116, 5, 3, 3895],
 [2924, 1679, 4720, 13, 36, 4274, 6971, 4, 2094, 1102],
 [285, 781, 461, 7, 1555, 1910, 8, 3623],
 [233, 513, 2925, 12, 8, 928, 225, 368, 1, 4275, 15117, 8436],
 [237, 3896, 8437, 3338, 37, 234, 15118, 5, 6, 172],
 [15119, 1393, 664, 650, 4, 326, 2, 1030],
 [533, 2094, 10738, 122, 10739, 5, 10740, 4721, 1911],
 [2577,
  1394,
  382,
  44,
  3897,
  347,
  318,
  1031,
  1,
  23,
  15120,
  19,
  1103,
  386,
  102,
  1309],
 [1680, 8438

In [23]:
X_t = sequence.pad_sequences(list_tokenized_headlines, maxlen=100)


In [24]:
X_t

array([[    0,     0,     0, ...,     5,  2576,  8433],
       [    0,     0,     0, ...,   257,     8,  1001],
       [    0,     0,     0, ...,    45,     1, 10735],
       ...,
       [    0,     0,     0, ..., 10734,     8,    67],
       [    0,     0,     0, ...,  1730,  3802,  3561],
       [    0,     0,     0, ...,     5,     3,   824]], dtype=int32)

In [26]:
embedding_size = 128
input_ = Input(shape=(100,))
x = Embedding(20000, embedding_size)(input_)
x = LSTM(25, return_sequences=True)(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.5)(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.5)(x)
# There are 2 different possible classes, so we use 2 neurons in our output layer
x = Dense(2, activation='softmax')(x)

model = Model(inputs=input_, outputs=x)

W0902 18:25:54.095216 140439550609152 deprecation_wrapper.py:119] From /home/matthew/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0902 18:25:54.098342 140439550609152 deprecation_wrapper.py:119] From /home/matthew/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0902 18:25:54.312226 140439550609152 deprecation_wrapper.py:119] From /home/matthew/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0902 18:25:54.320943 140439550609152 deprecation.py:506] From /home/matthew/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is dep

In [27]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

W0902 18:26:05.089885 140439550609152 deprecation_wrapper.py:119] From /home/matthew/anaconda3/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0902 18:26:05.164718 140439550609152 deprecation_wrapper.py:119] From /home/matthew/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3295: The name tf.log is deprecated. Please use tf.math.log instead.



In [28]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 128)          2560000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100, 25)           15400     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 25)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 25)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                1300      
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
__________

In [11]:
y = pd.get_dummies(target).values

In [31]:
model.fit(X_t, y, epochs=3, batch_size=32, validation_split=0.3)

W0902 18:27:24.807078 140439550609152 deprecation.py:323] From /home/matthew/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 18696 samples, validate on 8013 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fba544e8ac8>

In [33]:
import keras
from keras.layers import LSTM, GRU, Dense, GlobalMaxPool1D, Embedding, Dropout
from keras.preprocessing import text, sequence
from keras.models import Sequential

In [38]:
gru_model = Sequential()
gru_model.add(Embedding(20000, 128))
gru_model.add(GRU(50, return_sequences=True))
gru_model.add(GlobalMaxPool1D())
gru_model.add(Dropout(0.5))
gru_model.add(Dense(50, activation='relu'))
gru_model.add(Dropout(0.5))
gru_model.add(Dense(2, activation='softmax'))

In [39]:
gru_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [40]:
gru_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 128)         2560000   
_________________________________________________________________
gru_2 (GRU)                  (None, None, 50)          26850     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 50)                0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 50)                2550      
_________________________________________________________________
dropout_6 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 102       
Total para

In [42]:
gru_model.fit(X_t, y, epochs=3, batch_size=32, validation_split=0.3)


Train on 18696 samples, validate on 8013 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fba3eb24898>

In [13]:
import json
import pandas as pd
import numpy as np
np.random.seed(0)
from gensim.models import Word2Vec
from nltk import word_tokenize
import matplotlib.pyplot as plt
%matplotlib inline
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [14]:
tokenizer = text.Tokenizer(num_words=20000)
tokenizer.fit_on_texts(list(df))
list_tokenized_train = tokenizer.texts_to_sequences(df)
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=100)

In [15]:
embedding_size = 128
input_ = Input(shape=(100,))
x = Embedding(30000, embedding_size)(input_)
x = Bidirectional(LSTM(25, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.5)(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(2, activation='sigmoid')(x)

model = Model(inputs=input_, outputs=x)

W0905 23:43:37.648532 139746467665664 deprecation_wrapper.py:119] From /home/matthew/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0905 23:43:37.884858 139746467665664 deprecation_wrapper.py:119] From /home/matthew/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0905 23:43:37.939423 139746467665664 deprecation_wrapper.py:119] From /home/matthew/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0905 23:43:38.691143 139746467665664 deprecation_wrapper.py:119] From /home/matthew/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.place

In [16]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

W0905 23:43:49.372870 139746467665664 deprecation_wrapper.py:119] From /home/matthew/anaconda3/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0905 23:43:49.414872 139746467665664 deprecation_wrapper.py:119] From /home/matthew/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3376: The name tf.log is deprecated. Please use tf.math.log instead.

W0905 23:43:49.423555 139746467665664 deprecation.py:323] From /home/matthew/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [17]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 128)          3840000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 50)           30800     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 50)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                2550      
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
__________

In [18]:
checkpoints_path = 'weights_base.best.hdf5'
checkpoint = ModelCheckpoint(checkpoints_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early_stopping = EarlyStopping(monitor='val_loss', mode='min', patience=25)
callbacks = [checkpoint, early_stopping]

In [19]:
model.fit(X_t, y, batch_size=32, epochs=2, validation_split=0.1, callbacks=callbacks)

Train on 24038 samples, validate on 2671 samples
Epoch 1/2

Epoch 00001: val_loss improved from inf to 0.32147, saving model to weights_base.best.hdf5
Epoch 2/2

Epoch 00002: val_loss did not improve from 0.32147


<keras.callbacks.History at 0x7f18f85082d0>