In [21]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [22]:
df = pd.read_json("E:/Sarcasm_Headlines_Dataset_v2.json", lines=True)
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [23]:
del df['article_link']
df.head()

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...


In [24]:
df.duplicated().sum()

116

In [25]:
df.drop_duplicates(inplace=True)

In [26]:
df.head()

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...


In [27]:
df.rename(columns={'is_sarcastic': 'label'}, inplace=True)
df.head()

Unnamed: 0,label,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...


lab_to_sarcasm = {0:"Not Sarcastic", 1:"Sarcastic"}
def label_decoder(label):
  return lab_to_sarcasm[label]
df.is_sarcastic = df.is_sarcastic.apply(lambda x: label_decoder(x))
df.head()

# Split & Train

In [28]:
import tensorflow as tf
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline
from datasets import Dataset
import evaluate

In [29]:
sentences = df['headline']
label = df['label']

In [30]:
TRAIN_SIZE = 0.8

In [31]:
train_data, test_data = train_test_split(df, test_size=1-TRAIN_SIZE,
                                         random_state=42) # Splits Dataset into Training and Testing set
print("Train Data size:", len(train_data))
print("Test Data size", len(test_data))

Train Data size: 22802
Test Data size 5701


In [32]:
hg_train_data = Dataset.from_pandas(train_data)
hg_test_data = Dataset.from_pandas(test_data)

In [33]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [34]:
def tokenize(data):
   return tokenizer(data["headline"], max_length=30, truncation=True, padding="max_length")
 
train_data = hg_train_data.map(tokenize)
test_data = hg_test_data.map(tokenize)

Map:   0%|          | 0/22802 [00:00<?, ? examples/s]

Map:   0%|          | 0/5701 [00:00<?, ? examples/s]

In [22]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
training_args = TrainingArguments(
    output_dir="./sarcasm_transformer/",          
    logging_dir='./sarcasm_transformer/logs',            
    logging_strategy='epoch',
    logging_steps=100,    
    num_train_epochs=3,              
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,  
    learning_rate=5e-6,
    seed=42,
    save_strategy='epoch',
    save_steps=100,
    evaluation_strategy='epoch',
    eval_steps=100,
    load_best_model_at_end=True
)



In [24]:
def compute_metrics(eval_pred):
    metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    # probabilities = tf.nn.softmax(logits)
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

trainer.train()

  0%|          | 0/475107 [00:00<?, ?it/s]

IndexError: Target 4 is out of bounds.

In [None]:
tokenizer.save_pretrained('./sarcasm_tokenizer_transformer/')
trainer.save_model('./sarcasm_trainer_transformer/')

import pickle
with open(r'E:/Research ML/LSTM_SarcasmDetection.pkl', 'wb') as file:  
    pickle.dump(model, file)