In [102]:
import pandas as pd
import numpy as np

import tensorflow as tf 
from tensorflow.keras.preprocessing.text import Tokenizer

In [103]:
# Load the sarcasm headlines dataset; lines=True tells pandas the file holds
# one JSON record per line rather than a single JSON document.
df = pd.read_json('sarcasm.json', lines= True)

In [104]:
df.shape

(28619, 3)

In [105]:
# Peek at the headline text of the row labelled 3.
df.loc[3, 'headline']

'inclement weather prevents liar from getting to work'

In [106]:
df

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...
...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,https://www.theonion.com/jews-to-celebrate-ros...
28615,1,internal affairs investigator disappointed con...,https://local.theonion.com/internal-affairs-in...
28616,0,the most beautiful acceptance speech this week...,https://www.huffingtonpost.com/entry/andrew-ah...
28617,1,mars probe destroyed by orbiting spielberg-gat...,https://www.theonion.com/mars-probe-destroyed-...


In [107]:
# Toy corpus used to walk through tokenizing and padding.
sent = [
    'I am learning Python',
    'I am learning Deep Learning',
    'I love dogs',
    'I love cats',
]

In [108]:
sent[1]

'I am learning Deep Learning'

In [109]:
type(sent)

list

In [110]:
sent

['I am learning Python',
 'I am learning Deep Learning',
 'I love dogs',
 'I love cats']

In [111]:
# Tokenizer for the toy corpus, with num_words set to 10.
token = Tokenizer(num_words=10)

In [112]:
# Build the word index from the toy sentences.
token.fit_on_texts(sent)

In [113]:
token.word_index

{'i': 1,
 'learning': 2,
 'am': 3,
 'love': 4,
 'python': 5,
 'deep': 6,
 'dogs': 7,
 'cats': 8}

In [114]:
# Map each sentence to its list of integer word ids from the fitted index.
sent_seq = token.texts_to_sequences(sent)

In [115]:
print(sent_seq)

[[1, 3, 2, 5], [1, 3, 2, 6, 2], [1, 4, 7], [1, 4, 8]]


In [116]:
# Same four example sentences as the earlier `sent` definition, declared again here.
sent = ['I am learning Python', 'I am learning Deep Learning', 'I love dogs', 'I love cats' ]

In [117]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [118]:
# Bring every toy sequence to length 10: zeros are appended at the end, and
# anything longer would be cut from the end as well.
sent_final = pad_sequences(
    sent_seq,
    maxlen=10,
    padding='post',
    truncating='post',
)

In [119]:
sent_final

array([[1, 3, 2, 5, 0, 0, 0, 0, 0, 0],
       [1, 3, 2, 6, 2, 0, 0, 0, 0, 0],
       [1, 4, 7, 0, 0, 0, 0, 0, 0, 0],
       [1, 4, 8, 0, 0, 0, 0, 0, 0, 0]])

## Project: Sarcasm Detection

In [120]:
# 1. Convert headlines into a list.
# 2. Convert all labels into a list


In [121]:
# Placeholders for the headline texts and their labels.
sentences, labels = [], []

df.shape

(28619, 3)

In [122]:
# All headline strings as a plain Python list, in row order.
sentences = df['headline'].tolist()

In [123]:
# Matching is_sarcastic labels as a plain Python list, in row order.
labels = df['is_sarcastic'].tolist()

In [124]:
# Create Training and Testing Datasets   => 90 : 10 

In [125]:
# Row count of the 90% training portion (fractional part dropped by int()).
train = int(len(df) * 90 / 100)
train

25757

In [126]:
# Training split: the first `train` headlines and their labels.
sentences_train, labels_train = sentences[:train], labels[:train]

In [127]:
# Testing split: everything after the first `train` headlines and labels.
sentences_test, labels_test = sentences[train:], labels[train:]

In [128]:
# Create Word Index

In [129]:
# Fresh tokenizer for the real dataset (rebinds the `token` used for the toy
# example): num_words=10000 limits the ids emitted by texts_to_sequences to the
# most frequent words, and any other word is mapped to the 'UNK'
# out-of-vocabulary token.
token = Tokenizer(num_words= 10000, oov_token= 'UNK')

In [130]:
# Fit the vocabulary on the training headlines only; the test headlines are
# not seen by the tokenizer at this point.
token.fit_on_texts(sentences_train)

In [131]:
# token.word_index

In [132]:
# Encode the training headlines as word ids, then pad / truncate at the end so
# that every row holds exactly 50 ids.
train_ids = token.texts_to_sequences(sentences_train)
train_seq = pad_sequences(train_ids, maxlen=50, padding='post', truncating='post')

In [133]:
train_seq.shape

(25757, 50)

In [134]:
# Encode the test headlines with the tokenizer fitted on the training data,
# then pad / truncate at the end so that every row holds exactly 50 ids.
test_ids = token.texts_to_sequences(sentences_test)
test_seq = pad_sequences(test_ids, maxlen=50, padding='post', truncating='post')

In [135]:
test_seq.shape

(2862, 50)

In [136]:
# Turn both label lists into NumPy arrays for training and evaluation.
train_label = np.asarray(labels_train)
test_label = np.asarray(labels_test)

In [137]:
test_label

array([0, 1, 0, ..., 0, 1, 1])

In [138]:
# Build a Model

In [139]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GlobalAvgPool1D, Flatten


In [140]:
# Empty Sequential container for the sarcasm classifier.
model = Sequential()

In [141]:
# Build the classifier in a single expression so that re-running this cell
# replaces `model` instead of stacking a second copy of every layer onto the
# existing one (which is what repeated `model.add(...)` calls do).
# Seeding first (Python, NumPy and TensorFlow RNGs) makes weight
# initialisation, dropout and batch shuffling repeatable on a fresh kernel,
# as far as the backend allows.
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # Input layer: 10000-word vocabulary, 50 ids per headline, 16-dim embeddings,
    # averaged over the sequence axis into one 16-dim vector per headline.
    Embedding(input_dim=10000, output_dim=16, input_length=50),
    GlobalAvgPool1D(),

    # First hidden layer
    Dense(128, activation='relu'),
    Dropout(0.25),

    # Second hidden layer
    Dense(64, activation='relu'),
    Dropout(0.25),

    # Output layer: a single sigmoid unit, matching the binary_crossentropy loss
    Dense(1, activation='sigmoid'),
])

In [142]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 50, 16)            160000    
                                                                 
 global_average_pooling1d_1  (None, 16)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_6 (Dense)             (None, 128)               2176      
                                                                 
 dropout_4 (Dropout)         (None, 128)               0         
                                                                 
 dense_7 (Dense)             (None, 64)                8256      
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                      

In [143]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [144]:
# Train for 10 epochs, reporting loss/accuracy on the held-out split after each one.
# The returned History object is kept: it stores the per-epoch metrics for
# later inspection or plotting, and assigning it stops the cell from echoing a
# bare `<...History at 0x...>` repr.
# NOTE(review): the same split is used as validation data here and as the test
# set, so the reported validation accuracy is not an untouched estimate —
# consider carving a separate validation slice out of the training data.
history = model.fit(
    train_seq,
    train_label,
    epochs=10,
    validation_data=(test_seq, test_label),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x15353c72f10>

In [150]:
# Raw headline to classify. It is kept under its own name so that `test` only
# ever holds the padded id matrix that is passed to the model.
test_text = ["Where are women judges in high courts ?"]

# Same preprocessing as the training data: fitted tokenizer, then padding /
# truncation at the end to 50 ids.
test = pad_sequences(
    token.texts_to_sequences(test_text),
    maxlen=50,
    padding='post',
    truncating='post',
)

In [151]:
# Predict the sigmoid output for the padded headline and round it to a hard
# 0/1 label.
model.predict(test).round()



array([[1.]], dtype=float32)