In [1]:
import pandas as pd

In [3]:
# Load the headlines dataset; lines=True tells pandas the file holds one JSON object per line.
data = pd.read_json("Sarcasm_Headlines_Dataset.json",lines=True)

In [4]:
data.head()


Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [5]:
# The article URL is not used as a feature, so remove it. Rebinding the result
# (instead of inplace=True) together with errors="ignore" keeps this cell safe
# to re-run: a second execution no longer raises KeyError for the missing column.
data = data.drop(columns="article_link", errors="ignore")

In [6]:
data.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [9]:
# Number of whitespace-separated words in each headline.
data['length'] = [len(headline.split()) for headline in data['headline']]

In [10]:
data.head()

Unnamed: 0,headline,is_sarcastic,length
0,former versace store clerk sues over secret 'b...,0,12
1,the 'roseanne' revival catches up to our thorn...,0,14
2,mom starting to fear son's web series closest ...,1,14
3,"boehner just wants wife to listen, not come up...",1,13
4,j.k. rowling wishes snape happy birthday in th...,0,11


In [28]:
data.shape

(26709, 3)

In [29]:
data.shape[0] * 0.2

5341.8

In [30]:
data.shape[0] * 0.8

21367.2

## Cleaning data 

In [118]:
from nltk.corpus import stopwords
import string

In [119]:
def clean_data(lines, stop_words=None):
    """Tokenize one headline into lowercase, punctuation-free content words.

    Steps: split on whitespace, lowercase, strip every character listed in
    ``string.punctuation``, then drop stop words and tokens of length <= 1.

    Args:
        lines: A single headline/sentence as a string.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, NLTK's English stop-word list is used; it is built once
            and cached on the function object.

    Returns:
        list[str]: The cleaned tokens, in their original order.
    """
    if stop_words is None:
        # Build the stop-word set once instead of on every call.
        stop_words = getattr(clean_data, "_stop_words", None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            clean_data._stop_words = stop_words

    punctuation_table = str.maketrans('', '', string.punctuation)

    tokens = []
    for word in lines.split():
        # Lowercase BEFORE the stop-word lookup: the stop-word list is
        # lowercase, so "The"/"THE" would otherwise slip through the filter.
        word = word.lower().translate(punctuation_table)
        if len(word) > 1 and word not in stop_words:
            tokens.append(word)

    return tokens
    

In [120]:
def process_doc(headlines):
    """Clean every headline in ``headlines``.

    Returns a list holding one token list (as produced by ``clean_data``)
    per headline, in iteration order.
    """
    return [clean_data(headline) for headline in headlines]
        

In [121]:
# Sequential (unshuffled) split: the first 21368 headlines (~80% of the 26709 rows)
# become the training set and the remaining rows the test set.
headlines = data.headline
train_headlines = process_doc(headlines[:21368])
test_headlines = process_doc(headlines[21368:])

In [122]:
from pickle import dump

def save_data(dataset,filename):
    """Pickle ``dataset`` to ``filename`` and print a confirmation line.

    Args:
        dataset: Any picklable object.
        filename: Destination path; an existing file is overwritten.
    """
    # The context manager closes (and flushes) the handle even if dump() raises.
    with open(filename, 'wb') as fh:
        dump(dataset, fh)
    print("saved {}".format(filename))



In [123]:
data.head()

Unnamed: 0,headline,is_sarcastic,length
0,former versace store clerk sues over secret 'b...,0,12
1,the 'roseanne' revival catches up to our thorn...,0,14
2,mom starting to fear son's web series closest ...,1,14
3,"boehner just wants wife to listen, not come up...",1,13
4,j.k. rowling wishes snape happy birthday in th...,0,11


In [124]:
# Pair the cleaned token lists with their labels, slicing the label column at the
# same 21368-row cut-off used for the headlines so features and labels stay aligned.
trainX = train_headlines
trainy = data['is_sarcastic'][:21368]

testX = test_headlines
testy = data['is_sarcastic'][21368:]

In [125]:
# Persist each split as a two-element [features, labels] list.
save_data([trainX, trainy],"train.pkl")
save_data([testX, testy], "test.pkl")

saved train.pkl
saved test.pkl


## Encoding Data

In [126]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from pickle import load

In [127]:
def load_data(filename):
    """Load and return the object pickled in ``filename``.

    NOTE: unpickling can execute arbitrary code, so only call this on files
    you created yourself or otherwise trust.
    """
    # The context manager makes sure the file handle is closed after reading.
    with open(filename, 'rb') as fh:
        return load(fh)


In [128]:
def create_tokenizer(lines):
    """Return a new Keras ``Tokenizer`` fitted on ``lines``."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [129]:
def encode_text(tokenizer, lines, length):
    """Integer-encode ``lines`` with ``tokenizer`` and pad to ``length``.

    Each document is converted with ``texts_to_sequences`` and the result is
    passed to ``pad_sequences`` with ``maxlen=length`` and post-padding.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

In [130]:
def max_length(lines):
    """Return the length of the longest element in ``lines``."""
    return max(map(len, lines))


## Create model

In [107]:
from keras.layers import Dense, Flatten, Input, Embedding, Dropout
from keras.models import Model
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate
from numpy import array

In [108]:
def _conv_channel(length, vocab_size, kernel_size):
    """Build one embedding -> Conv1D -> dropout -> max-pool -> flatten branch.

    Returns:
        tuple: ``(input_tensor, flattened_output)`` for the branch.
    """
    inputs = Input(shape=(length,))
    embedding = Embedding(vocab_size, 100)(inputs)
    conv = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedding)
    drop = Dropout(0.5)(conv)
    pool = MaxPooling1D(pool_size=2)(drop)
    flat = Flatten()(pool)
    return inputs, flat


def define_model(length, vocab_size):
    """Build and compile the three-channel 1D-CNN binary classifier.

    Every channel receives a sequence of ``length`` token ids and has its own
    100-dimensional embedding; the channels differ only in convolution kernel
    size (4, 6 and 8). Their flattened outputs are concatenated and passed
    through a 10-unit ReLU layer to a single sigmoid output.

    Args:
        length: Number of token ids per input sequence.
        vocab_size: Size of the embedding vocabulary.

    Returns:
        The compiled Keras ``Model``; it expects a list of three inputs.
    """
    # One branch per kernel size, created in the same order as before so the
    # model's input order is unchanged.
    channels = [_conv_channel(length, vocab_size, kernel_size)
                for kernel_size in (4, 6, 8)]
    channel_inputs = [inputs for inputs, _ in channels]
    channel_outputs = [flat for _, flat in channels]

    # merge
    merged = concatenate(channel_outputs)

    # interpretation
    dense1 = Dense(10, activation='relu')(merged)
    outputs = Dense(1, activation='sigmoid')(dense1)
    model = Model(inputs=channel_inputs, outputs=outputs)

    # compile
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line.
    model.summary()
    # plot_model(model, show_shapes=True, to_file='multichannel.png')
    return model


In [131]:
trainlines , trainlabels = load_data('train.pkl')

In [132]:
tokenizer = create_tokenizer(trainlines)

In [133]:
# Sequences are padded to the token count of the longest training headline.
length = max_length(trainlines)

# Keras Tokenizer word indices start at 1 (0 is left for padding), hence the +1.
vocab_size = len(tokenizer.word_index) + 1

In [134]:
trainX = encode_text(tokenizer,trainlines,length)

In [135]:
# The same padded matrix is fed to all three input channels; the channels differ
# only in their convolution kernel size.
model = define_model(length,vocab_size)
model.fit([trainX,trainX,trainX],array(trainlabels),epochs=10)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           (None, 27)           0                                            
__________________________________________________________________________________________________
input_20 (InputLayer)           (None, 27)           0                                            
__________________________________________________________________________________________________
input_21 (InputLayer)           (None, 27)           0                                            
__________________________________________________________________________________________________
embedding_19 (Embedding)        (None, 27, 100)      2525400     input_19[0][0]                   
__________________________________________________________________________________________________
embedding_

<keras.callbacks.History at 0x21292c9feb8>

In [136]:
testlines, testlabels = load_data('test.pkl')

In [138]:
testX = encode_text(tokenizer, testlines, length)



In [139]:
# Evaluate on the held-out headlines, encoded with the tokenizer fitted on the training set.
loss, acc = model.evaluate([testX,testX,testX],array(testlabels),verbose=0)
print("accuracy of test data is {}".format(acc*100))

accuracy of test data is 79.70417525254481


In [140]:
# Accuracy on the training data itself, for comparison with the test accuracy
# (a large gap between the two indicates overfitting).
loss, acc = model.evaluate([trainX,trainX,trainX],array(trainlabels),verbose=0)
print("accuracy of train data is {}".format(acc*100))

accuracy of train data is 99.93916136278548


# Now we will check the model on some random data

In [168]:
text = "i am intrested in meeting people who are not intrested in meeting people"

In [169]:
# Apply the same cleaning used for the training headlines.
# NOTE: this rebinds `text` from a string to a token list, so re-running this cell
# without first re-running the cell that defines `text` fails (a list has no .split()).
text = clean_data(text)

In [170]:
# Wrap the token list in a list so it is encoded as a single document.
text = encode_text(tokenizer,[text],length)

In [171]:
# The model ends in a single sigmoid unit, so the output is a score in [0, 1];
# presumably values near 1 mean "sarcastic" (is_sarcastic == 1) — confirm against the labels.
model.predict([text,text,text])

array([[0.09421725]], dtype=float32)