### Training a Sarcasm Detection Model using a Convolution Layer

In [2]:
#importing libraries

import json
import matplotlib.pyplot as plt
import tensorflow as tf

In [3]:
# Location of the sarcasm headlines dataset on disk
filepath = "D:\\Tensorflow_Works\\8-Sentiment_in_Text\\sarcasm.json"

# Pass the encoding explicitly: without it, open() falls back to the
# platform's locale encoding, which can mis-decode or reject non-ASCII
# characters in the JSON text.
with open(filepath, "r", encoding="utf-8") as file:
    data = json.load(file)

# Peek at the first record to see which fields are available
data[0]

{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
 'is_sarcastic': 0}

In [4]:
#pull the headline text and its sarcasm flag out of every record
sentences = [item['headline'] for item in data]
labels = [item['is_sarcastic'] for item in data]

    

In [5]:
#tunable settings, gathered in one place so they are easy to adjust

training_size = 20_000   #how many examples go into the training split
vocab_size = 10_000      #upper bound on the tokenizer vocabulary
max_length = 32          #length every sequence is padded/truncated to
embedding_dims = 16      #size of each embedding vector

In [6]:
#the first `training_size` examples are used for training, the remainder for testing
train_sentences, test_sentences = sentences[:training_size], sentences[training_size:]
train_labels, test_labels = labels[:training_size], labels[training_size:]

In [7]:
#text vectorization layer whose vocabulary is capped at `vocab_size` tokens
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
)

#build the vocabulary from the training sentences only
vectorize_layer.adapt(train_sentences)

In [8]:
#wrap the (sentence, label) pairs of each split in a tf.data dataset

train_dataset = tf.data.Dataset.from_tensor_slices(
    (train_sentences, train_labels)
)
test_dataset = tf.data.Dataset.from_tensor_slices(
    (test_sentences, test_labels)
)

In [9]:
#defining preprocessing function

def preprocessing_func(dataset):
    """Convert a dataset of (text, label) pairs into (padded token ids, label) pairs.

    Every text is tokenized with the notebook-level ``vectorize_layer``; the
    whole dataset is then gathered into a single ragged batch so that the
    token sequences can be padded/truncated to ``max_length`` in one call.

    Args:
        dataset: A ``tf.data.Dataset`` yielding ``(text, label)`` elements.

    Returns:
        A ``tf.data.Dataset`` yielding ``(padded_sequence, label)`` elements.
    """
    # Tokenize each text; labels pass through unchanged.
    tokenized = dataset.map(lambda text, label: (vectorize_layer(text), label))

    # Sequences differ in length, so gather the entire dataset into one
    # ragged batch and pull it out as a pair of tensors.
    whole_batch = tokenized.ragged_batch(batch_size=tokenized.cardinality())
    ragged_sequences, label_tensor = whole_batch.get_single_element()

    # Short sequences are padded at the front; long ones are cut at the end.
    padded = tf.keras.utils.pad_sequences(
        ragged_sequences.numpy(),
        maxlen=max_length,
        padding="pre",
        truncating="post",
    )

    # Re-assemble an element-wise dataset of (sequence, label) pairs.
    return tf.data.Dataset.zip(
        tf.data.Dataset.from_tensor_slices(padded),
        tf.data.Dataset.from_tensor_slices(label_tensor),
    )


In [10]:
#run both splits through the tokenize-and-pad transformation

train_dataset_vectorized = train_dataset.apply(
    preprocessing_func
)
test_dataset_vectorized = test_dataset.apply(
    preprocessing_func
)

In [11]:
#show two (padded sequence, label) pairs, each followed by a blank line

for example in train_dataset_vectorized.take(2):
    print(example, end="\n\n")

(<tf.Tensor: shape=(32,), dtype=int32, numpy=
array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,  319,    1,
        943, 4079, 2366,   47,  366,   94, 2026,    6, 2653, 9470],
      dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)

(<tf.Tensor: shape=(32,), dtype=int32, numpy=
array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    4, 7185, 3128, 3305,
         28,    2,  152,    1,  358, 2902,    6,  236,    9,  844],
      dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)



In [None]:
#optimizing datasets for training



In [None]:
# Hyperparameters of the convolutional model.
# The embedding size is taken from `embedding_dims` in the parameters cell
# instead of being hard-coded a second time, so tweaking it there actually
# changes the model. The EMBEDDING_DIM name is kept for any later cells.
EMBEDDING_DIM = embedding_dims
FILTERS = 128
KERNEL_SIZE = 5
DENSE_DIM = 6

# Model definition with Conv1D:
# token ids -> embeddings -> 1-D convolution -> global max pooling
# -> small dense layer -> single sigmoid unit (binary output).
model_conv = tf.keras.Sequential([
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM),
    tf.keras.layers.Conv1D(FILTERS, KERNEL_SIZE, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(DENSE_DIM, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Set the training parameters: binary cross-entropy matches the 0/1 labels
model_conv.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the model summary
model_conv.summary()