In [75]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader

In [16]:
df = pd.read_json(r"C:\Users\Gokul\Downloads\sarcasm_kaggle\Sarcasm_Headlines_Dataset_v2.json", lines=True)
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [17]:
def data_drop(column_name, data):
    data1 = data.drop([column_name], axis = 1)
    return data1

In [18]:
print(df.shape)
df = df.drop_duplicates()
print(df.shape)

(28619, 3)
(28617, 3)


In [19]:
df = data_drop('article_link', df)
df.head()

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...


In [27]:
y_data = df.is_sarcastic
x_data = data_drop('is_sarcastic', df)
print(x_data.size == y_data.size)

True


In [42]:
from string import punctuation

x_list = x_data['headline'].tolist()
x_list = [x.lower() for x in x_list]
def remove_punc(string):
    for i in string:
        if i in punctuation:
            string = string.replace(i, "")
    return string
lines_split = [remove_punc(c) for c in x_list]
print(lines_split)
# splitting using \n
words = [word for line in lines_split for word in line.split()]
print(words[:20])

['thirtysomething', 'scientists', 'unveil', 'doomsday', 'clock', 'of', 'hair', 'loss', 'dem', 'rep', 'totally', 'nails', 'why', 'congress', 'is', 'falling', 'short', 'on', 'gender', 'racial']


In [43]:
from collections import Counter

counts = Counter(words)
#print(counts)
vocab = sorted(counts, key=counts.get, reverse=True)
#print(vocab)
vocab2int = {word:i for i, word in enumerate(vocab, 1)}
#print(vocab2int)
lines_ints= []
for line in lines_split:
    lines_ints.append([vocab2int[word] for word in line.split()])
print(lines_ints)

[[15526, 336, 3106, 6323, 2320, 2, 671, 1138], [7344, 1731, 732, 3107, 46, 226, 10, 1886, 1066, 7, 1659, 2102, 1732], [892, 34, 11206, 615, 15527, 606, 1447], [11207, 1596, 6324, 4497, 14, 138, 1, 143], [443, 470, 286, 999, 1, 558, 616, 4114, 6325], [78, 69, 8885], [100, 256, 1, 4115, 34, 2452, 9, 559, 1504], [1806, 15528, 11208, 3108, 1106, 24, 220, 24, 1597, 2, 1399, 3298, 4116], [2916, 293, 138, 128, 1807, 1, 425, 4, 15529, 1558, 340, 2453], [2917, 2, 132, 107, 20, 7345], [20, 2586, 10, 3109, 6, 453, 4, 3525, 8, 64, 965, 43, 15, 23], [7346, 11209, 617, 59, 515, 225, 10, 6, 618, 3299], [33, 1, 107, 11210, 1660, 11211, 5, 1107], [560, 1559, 943, 142, 82, 72, 1, 1661, 5, 8886, 15530, 7347, 2, 1139, 2, 11212], [1505, 4117, 11, 6326, 23, 1448, 11213, 7, 8887], [1361, 365, 7348, 8888, 4970, 77, 14, 12, 1662, 7349, 2103], [48, 397, 1140, 15531, 3300], [48, 13, 305, 96, 2, 30, 2454, 18, 11214], [535, 54, 415, 9, 3301, 329], [3791, 2918, 94, 56, 1, 2455, 17, 1, 405, 34, 498, 250], [1598, 144

In [44]:
# stats about vocabulary
print('Unique words: ', len((vocab2int)))

# print tokens in first review
print('Tokenized review: \n', lines_ints[:1])

Unique words:  29667
Tokenized review: 
 [[15526, 336, 3106, 6323, 2320, 2, 671, 1138]]


In [46]:
encoded_labels = y_data.to_numpy()
print(encoded_labels, encoded_labels.shape)

[1 0 0 ... 0 1 1] (28617,)


In [56]:
lines_lengths = Counter([len(x) for x in lines_ints])
print("Zero-length lines: {}".format(lines_lengths[0]))
print("Maximum sarcasm line length: {}".format(max(lines_lengths)))
print(lines_lengths)

Zero-length lines: 0
Maximum sarcasm line length: 151
Counter({10: 3643, 9: 3433, 11: 3392, 8: 2935, 12: 2841, 7: 2407, 13: 2079, 6: 1777, 14: 1481, 5: 1170, 15: 952, 4: 594, 16: 579, 17: 379, 3: 307, 18: 225, 19: 142, 2: 119, 20: 63, 21: 45, 22: 23, 23: 13, 24: 3, 27: 3, 29: 2, 25: 2, 28: 2, 151: 1, 31: 1, 26: 1, 38: 1, 39: 1, 30: 1})


In [57]:
def pad_features(lines_ints, seq_length):
    features = np.zeros((len(lines_ints), seq_length), dtype=int)
    for i, row in enumerate(lines_ints):
        features[i, -len(row):] = np.array(row)[:seq_length]

    return features

In [63]:
seq_length = 35
features = pad_features(lines_ints, seq_length=seq_length)

## test statements
print(len(features)==len(lines_ints)) #Your features should have as many rows as reviews.
print(len(features[0])==seq_length) #Each feature row should contain seq_length values.

# print first 10 values of the first 30 batches
print(features[:-30,:-10])

True
True
[[   0    0    0 ...    0    0    0]
 [   0    0    0 ... 7344 1731  732]
 [   0    0    0 ...    0    0    0]
 ...
 [   0    0    0 ...    0    0  106]
 [   0    0    0 ...    0    0    0]
 [   0    0    0 ...    0    0    0]]


In [64]:
split_frac = 0.8
split_idx = int(len(features)*split_frac)
train_x, remaining_x = features[:split_idx], features[split_idx:]
train_y, remaining_y = encoded_labels[:split_idx], encoded_labels[split_idx:]

half_idx = int(len(remaining_x)*0.5)
valid_x, test_x = remaining_x[:half_idx], remaining_x[half_idx:]
valid_y, test_y = remaining_y[:half_idx], remaining_y[half_idx:]

print("Train set: \t\t{}".format(train_x.shape),
      "\nValidation set: \t{}".format(valid_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

Train set: 		(22893, 35) 
Validation set: 	(2862, 35) 
Test set: 		(2862, 35)


In [83]:
# Creating Tensor Datasets
train_data = TensorDataset(torch.from_numpy(train_x),torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(valid_x),torch.from_numpy(valid_y))
test_data = TensorDataset(torch.from_numpy(test_x),torch.from_numpy(test_y))

# Dataloaders
batch_size = 50

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size, drop_last=True)

In [77]:
# obtain one batch of training data
example = iter(train_loader)
sample_x, sample_y = example.next()

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 35])
Sample input: 
 tensor([[    0,     0,     0,  ...,     4,  1972,  6715],
        [    0,     0,     0,  ...,  1154,   139,  1419],
        [    0,     0,     0,  ...,   143,     5,  3540],
        ...,
        [    0,     0,     0,  ...,    49,  1932,    17],
        [    0,     0,     0,  ...,   859,  1987, 23478],
        [    0,     0,     0,  ...,  2883,    34,   494]], dtype=torch.int32)

Sample label size:  torch.Size([50])
Sample label: 
 tensor([0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0,
        1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1,
        1, 0])


In [78]:
class RNN(nn.Module):
    def __init__(self, vocab_size, output_size, embed_dim, hidden_dim, n_layers, drop_prob=0.5):
        super(RNN, self).__init__()
        self.output_size = output_size
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        # Embedding and LSTM
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        batch_size = x.size(0)

        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        # Dropout and FC after reshaping
        out = self.dropout(lstm_out)
        out = self.fc(out)
        sig_out = self.sig(out)
        # Reshape to batch size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels

        return sig_out, hidden

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        return hidden

In [79]:
# Hyperparameters
vocab_size = len(vocab2int)+1
output_size = 1
embed_dim = 400
hidden_dim = 256
n_layers = 2
epochs = 1
lr = 0.001

model = RNN(vocab_size, output_size, embed_dim, hidden_dim, n_layers)
print(model)

RNN(
  (embedding): Embedding(29668, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [80]:
# Loss and optimizer
loss_func = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [81]:
counter = 0
print_every = 10
clip = 5

model.train()
# Training loop
for epoch in tqdm(range(epochs)):
    h = model.init_hidden(batch_size)

    for inputs, labels in train_loader:
        counter += 1
        h = tuple([each.data for each in h])

        output, h = model(inputs, h)
        loss = loss_func(output.squeeze(), labels.float())

        model.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter%print_every == 0:
            val_h = model.init_hidden(batch_size)
            val_losses = []
            model.eval()
            for inputs, labels in valid_loader:
                val_h = tuple([each.data for each in val_h])
                output, val_h = model(inputs, val_h)
                val_loss = loss_func(output.squeeze(), labels.float())
                val_losses.append(val_loss.item())

            model.train()
            print('Epoch: {}/{}...'.format(epoch+1, epochs),
                  'Step: {}...'.format(counter),
                  'Loss: {:.6f}...'.format(loss.item()),
                  'Valid_loss: {:.6f}...'.format(np.mean(val_losses)))

100%|██████████| 1/1 [07:02<00:00, 422.99s/it]


Epoch: 1/1... Step: 10... Loss: 0.683962... Valid_loss: 0.671224...
Epoch: 1/1... Step: 20... Loss: 0.598524... Valid_loss: 0.580768...
Epoch: 1/1... Step: 30... Loss: 0.547371... Valid_loss: 0.560788...
Epoch: 1/1... Step: 40... Loss: 0.473554... Valid_loss: 0.539025...
Epoch: 1/1... Step: 50... Loss: 0.438349... Valid_loss: 0.526397...
Epoch: 1/1... Step: 60... Loss: 0.588752... Valid_loss: 0.522343...
Epoch: 1/1... Step: 70... Loss: 0.436827... Valid_loss: 0.510954...
Epoch: 1/1... Step: 80... Loss: 0.443609... Valid_loss: 0.494042...
Epoch: 1/1... Step: 90... Loss: 0.329247... Valid_loss: 0.492629...
Epoch: 1/1... Step: 100... Loss: 0.622796... Valid_loss: 0.471140...
Epoch: 1/1... Step: 110... Loss: 0.374756... Valid_loss: 0.460190...
Epoch: 1/1... Step: 120... Loss: 0.418353... Valid_loss: 0.464494...
Epoch: 1/1... Step: 130... Loss: 0.447154... Valid_loss: 0.456816...
Epoch: 1/1... Step: 140... Loss: 0.458779... Valid_loss: 0.448546...
Epoch: 1/1... Step: 150... Loss: 0.449444..

In [84]:
# Testing
test_losses = []
num_correct = 0

h = model.init_hidden(batch_size)
model.eval()
for inputs, labels in test_loader:
    h = tuple([each.data for each in h])
    output, h = model(inputs, h)
    test_loss = loss_func(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())

    # converting output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze()) # round to nearest integer

    # compare pred with true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy())
    num_correct += np.sum(correct)

# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.360
Test accuracy: 0.838


In [126]:
# Inference

def tokenize_line(test_line):
    test_line = test_line.lower()
    test_text = ''.join([c for c in test_line if c not in punctuation])
    test_words = test_text.split()

    test_ints = []
    test_ints.append([vocab2int[word] for word in test_words])

    return test_ints

In [127]:
def predict(model, test_line, sequence_length = 35):
    model.eval()
    test_ints = tokenize_line(test_line)
    seq_length = sequence_length
    features = pad_features(test_ints, seq_length)
    feature_tensor = torch.from_numpy(features)

    batch_size = feature_tensor.size(0)
    h = model.init_hidden(batch_size)
    output, h = model(feature_tensor, h)
    pred = torch.round(output.squeeze())

    # printing output value, before rounding
    print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))
    # print response
    if(pred.item()==1):
        print("Sorry, you wouldn't get it!")
    else:
        print("Just news")

In [138]:
# negative test review
test_line = "boehner just wants wife to listen, not come up with alternative debt-reduction ideas"
# call function
seq_length = 35 # good to use the length that was trained on
predict(model, test_line, seq_length)



Prediction value, pre-rounding: 0.875984
Sorry, you wouldn't get it!
