In [26]:
import os
import sys
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import OrderedDict
import itertools as it
from functools import reduce 
import time

# Helper Functions

In [27]:
def k_fold_cross_validation(data, k):
    """
    Generate train/test splits of a data frame for k-fold cross-validation.

    params:
        data: This should be a pandas data frame
        k: This is an int indicating the folds of the data to be performed
           If k is 0, perform leave-one-out (LOO) cross-validation: every row
           is held out once as a single-row test set
           If k is 1, the data is both test and train
    output:
        A generator of (train, test, cross) tuples, where
        train: This is a pandas data frame
        test: This is a pandas data frame
        cross: This is which kth fold that we have just yielded
               (the held-out row position for LOO, and k itself when k is 1)
    """
    if k == 1:
        yield (data, data, k)
        return
    size = len(data)
    if k == 0:
        # Walk row positions: iterating a data frame directly yields its
        # column labels, not its rows.
        for i in range(size):
            train = pd.concat([data.iloc[:i], data.iloc[i + 1:]])
            test = data.iloc[i:i + 1]
            yield (train, test, i)
        # Stop here so LOO does not fall through into the k-fold loop.
        return
    for cross in range(k):
        start = int(cross * size / k)
        stop = int((cross + 1) * size / k)
        # pd.concat is used because DataFrame.append no longer exists in pandas 2.x.
        train = pd.concat([data.iloc[:start], data.iloc[stop:]])
        test = data.iloc[start:stop]
        yield (train, test, cross)

In [28]:
def run_algorithm(train, test, algorithm):
    """
    Train and then test an algorithm, timing the two calls together.

    params:
        train: a pandas data frame of the training data
        test: a pandas data frame of the testing data
        algorithm: a function that is called twice:
        #   algorithm(train)          -> the information learned during training
        #   algorithm(test, learned)  -> the result of testing (documented as the
        #                                confusion matrix of the predictions)
    outputs:
        confusion_matrix: whatever the second (testing) call returned
        duration: wall-clock seconds spent in both calls, as a float
    """
    began = time.time()
    # First call has no training info, so the algorithm is expected to train.
    learned = algorithm(train)
    # Second call receives the training info and evaluates on the test data.
    outcome = algorithm(test, learned)
    elapsed = time.time() - began
    return outcome, elapsed

In [29]:
def accuracy(confusion_matrix):
    """
    Compute the overall accuracy from a binary confusion matrix.

    params:
        confusion_matrix: a dictionary where entries are of the form {(T/F,T/F):freq}
                          freq is the occurrence of that prediction outcome.
                          Outcomes that never occurred may be left out of the
                          dictionary; they are counted as 0.
    outputs:
        The output is a float between 0 and 1 indicating the overall accuracy of the
        model given the binary confusion matrix.
    raises:
        ZeroDivisionError: if the frequencies sum to 0 (e.g. an empty matrix)
    """
    # .get(..., 0) so a matrix with no true positives (or no true negatives)
    # recorded does not raise KeyError.
    correct = confusion_matrix.get((True, True), 0) + confusion_matrix.get((False, False), 0)
    return correct / sum(confusion_matrix.values())

# Import Data

In [30]:
# Read the line-delimited JSON dataset and keep only the headline text and its label
data = pd.read_json('../data/Sarcasm_Headlines_Dataset.json', lines=True)[['headline', 'is_sarcastic']]
data.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [31]:
# Lower-case each headline, then drop every character that is not a letter,
# a digit or whitespace (whitespace itself is left untouched).
# The pattern is a raw string so \s reaches the regex engine unchanged, and the
# letter range is a-z only: an 'A-z' range would also admit [ \ ] ^ _ ` and
# leave that punctuation in the text.
data['headline'] = data['headline'].apply(lambda x: re.sub(r'[^a-z0-9\s]', '', x.lower()))
data[['headline','is_sarcastic']].head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret bl...,0
1,the roseanne revival catches up to our thorny ...,0
2,mom starting to fear sons web series closest t...,1
3,boehner just wants wife to listen not come up ...,1
4,jk rowling wishes snape happy birthday in the ...,0


# Let's Do This Without Removing Stopwords

In [32]:
# Pull the model input (headline text) and the target (sarcasm label) out as two np arrays
x = data['headline'].values
y = data['is_sarcastic'].values

In [33]:
# We need a way to get the length of every element of a collection:
# characters for raw headline strings, tokens for tokenized headlines.
def mylen(items):
    """
    Return the length of each element of `items` as a 1-D integer array.

    params:
        items: an iterable of sized objects (strings, lists, ...). The elements
               may have different lengths, and the iterable may be empty.
    outputs:
        A numpy integer array with one length per element, in order.
    """
    # A plain comprehension is used instead of np.vectorize(len): vectorize first
    # converts its argument to an array, which fails on ragged lists of lists and
    # on empty input, and turns equal-length lists into a 2-D array of scalars.
    return np.array([len(item) for item in items], dtype=int)
# x holds the headline strings, so these are character counts per headline, not word counts
print(mylen(x))

[76 81 78 ... 21 60 33]


In [34]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# These next few lines map the words in each headline to integers:
# the tokenizer is fitted on the headlines, then each headline is converted
# into a sequence of integer ids.
tk = Tokenizer()
tk.fit_on_texts(x)
x_seq = tk.texts_to_sequences(x)
# Length of every encoded headline, then the longest one
# (the maximum is reused below as the padding length).
print(mylen(x_seq))
print(max(mylen(x_seq)))

[12 14 14 ...  3  8  6]
39


In [40]:
# Pad every headline shorter than the longest one with trailing 0's
# (padding='post') so that all rows end up with the same length.
x_pad = pad_sequences(x_seq, maxlen=max(mylen(x_seq)), padding='post')
x_pad[:2]

array([[  293, 14674,   795,  3551,  2241,    47,   359,    92,  2138,
            5,  2509,  8285,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0],
       [    3,  8286,  3282,  2684,    27,     1,   155,  8287,   392,
         2874,     5,   240,     8,   950,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0]])

In [55]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
# One embedding row per word seen by the tokenizer, plus one extra row
# (the padded sequences use 0 as their fill value).
vocabulary_size = len(tk.word_counts) + 1
# Every padded headline has the length of the longest encoded headline.
max_words = max(mylen(x_seq))
embedding_size = 32
# Embedding -> LSTM -> single sigmoid unit for the binary sarcasm label
model = Sequential([
    Embedding(vocabulary_size, embedding_size, input_length=max_words),
    LSTM(50),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [46]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the padded headlines (and their labels) as a test set;
# random_state is fixed so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(x_pad, y, test_size = 0.25, random_state = 1)

In [56]:
# Train for 3 epochs, reporting metrics on the held-out split after each epoch.
# NOTE(review): the test split doubles as the validation data here, so there is
# no separate validation set — confirm this is intended before tuning against it.
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=3)

Train on 20031 samples, validate on 6678 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2036787d588>

In [57]:
# Score the trained model on the held-out split without progress output.
# The model was compiled with metrics=['accuracy'], so scores is presumably
# [loss, accuracy] and index 1 is the accuracy — TODO confirm against the Keras version in use.
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: ", scores[1])

Accuracy:  0.8493560947283687
