In [2]:
import numpy as np
import pandas as pd

In [3]:
import json

def parse_data(file):
    for l in open(file,'r'):
        yield json.loads(l)

data = list(parse_data('Sarcasm_Headlines_Dataset_v2.json'))
data[:10]

[{'is_sarcastic': 1,
  'headline': 'thirtysomething scientists unveil doomsday clock of hair loss',
  'article_link': 'https://www.theonion.com/thirtysomething-scientists-unveil-doomsday-clock-of-hai-1819586205'},
 {'is_sarcastic': 0,
  'headline': 'dem rep. totally nails why congress is falling short on gender, racial equality',
  'article_link': 'https://www.huffingtonpost.com/entry/donna-edwards-inequality_us_57455f7fe4b055bb1170b207'},
 {'is_sarcastic': 0,
  'headline': 'eat your veggies: 9 deliciously different recipes',
  'article_link': 'https://www.huffingtonpost.com/entry/eat-your-veggies-9-delici_b_8899742.html'},
 {'is_sarcastic': 1,
  'headline': 'inclement weather prevents liar from getting to work',
  'article_link': 'https://local.theonion.com/inclement-weather-prevents-liar-from-getting-to-work-1819576031'},
 {'is_sarcastic': 1,
  'headline': "mother comes pretty close to using word 'streaming' correctly",
  'article_link': 'https://www.theonion.com/mother-comes-pretty-

In [4]:
headline = []
is_sarcastic = []
for item in data:
    headline.append(item['headline'])
    is_sarcastic.append(item['is_sarcastic'])
df = pd.DataFrame()
df["headline"] = headline
df["is_sarcastic"] = is_sarcastic
df

Unnamed: 0,headline,is_sarcastic
0,thirtysomething scientists unveil doomsday clo...,1
1,dem rep. totally nails why congress is falling...,0
2,eat your veggies: 9 deliciously different recipes,0
3,inclement weather prevents liar from getting t...,1
4,mother comes pretty close to using word 'strea...,1
...,...,...
28614,jews to celebrate rosh hashasha or something,1
28615,internal affairs investigator disappointed con...,1
28616,the most beautiful acceptance speech this week...,0
28617,mars probe destroyed by orbiting spielberg-gat...,1


In [5]:
import spacy
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    tokens = nlp(text)
    count = 0
    tokens = [token.lemma_ for token in tokens if token.is_alpha and not token.is_stop]
    text = ' '.join(tokens)
    return text

df['headline'] = df['headline'].apply(preprocess_text)

In [7]:
df

Unnamed: 0,headline,is_sarcastic
0,thirtysomething scientist unveil doomsday cloc...,1
1,dem rep totally nail congress fall short gende...,0
2,eat veggie deliciously different recipe,0
3,inclement weather prevent liar get work,1
4,mother come pretty close word stream correctly,1
...,...,...
28614,jews celebrate rosh hashasha,1
28615,internal affair investigator disappoint conspi...,1
28616,beautiful acceptance speech week come queer ko...,0
28617,mars probe destroy orbit spielberg gate space ...,1


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

X_train, X_test, y_train, y_test = train_test_split(df['headline'], df['is_sarcastic'], test_size=0.2, random_state=42)
vectorizer = CountVectorizer()

X_train_count = vectorizer.fit_transform(X_train)
X_test_count = vectorizer.transform(X_test)

model = LogisticRegression()

model.fit(X_train_count, y_train)

y_pred = model.predict(X_test_count)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.7938504542278128


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

from torch.nn.utils.rnn import pad_sequence

class SarcasmDataset(Dataset):
    def __init__(self, headlines, labels, vectorizer):
        self.headlines = headlines
        self.labels = labels
        self.vectorizer = vectorizer
        self.vectorizer.fit(headlines)

    def __len__(self):
        return len(self.headlines)

    def __getitem__(self, idx):
        headline = self.headlines[idx]
        label = self.labels[idx]
        vectorized = torch.tensor(self.vectorizer.transform([headline]).toarray(), dtype=torch.float32).squeeze(0)
        return vectorized, torch.tensor(label, dtype=torch.float32)

X_train, X_test, y_train, y_test = train_test_split(df['headline'], df['is_sarcastic'], test_size=0.2, random_state=42)
vectorizer = CountVectorizer()

train_dataset = SarcasmDataset(X_train.tolist(), y_train.tolist(), vectorizer)
test_dataset = SarcasmDataset(X_test.tolist(), y_test.tolist(), vectorizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

class SarcasmLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(SarcasmLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = lstm_out[:, -1, :]  # Get the output of the last LSTM cell
        out = self.fc(lstm_out)
        return self.sigmoid(out)

# Initialize model, loss, and optimizer
vocab_size = len(vectorizer.vocabulary_)
embedding_dim = 100
hidden_dim = 128
output_dim = 1

model = SarcasmLSTM(vocab_size, embedding_dim, hidden_dim, output_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs.long())
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader)}')

def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs.long())
            preds = outputs.squeeze().round().cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(all_labels, all_preds)
    print(f'Accuracy: {accuracy}')

train_model(model, train_loader, criterion, optimizer, num_epochs=1)
evaluate_model(model, test_loader)
