In [274]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [275]:
# Import necessary libraries
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

## Loading and preprocessing the dataset

* Read the data from sarcasm.dat with comma as the separator
* Filter the dataframe, by taking rows that have a score greater than 10.
* Filter the dataframe further by only taking only the `label` and `comment` columns.
* Split the dataset into train and test split, with a test split of 0.2


In [276]:
# Read the data from train-balanced-sarcasm.csv.zip with comma as the separator
df = pd.read_csv('https://drive.google.com/uc?id=1D3Ma2HcEkkt85d5OtXxL5f8MHmrwvuQi&export=download', sep=',')

In [277]:
# Take a quick look at the dataframe
df.head(3)

Unnamed: 0,label,comment,score
0,1,This can't possibly go wrong.,29
1,1,I agree with the government...It would be MUCH...,2
2,0,There is just not enough people around that un...,1


In [278]:
# Keep only those rows of the dataframes with score>10


# Your code here
b = df['score'] > 10
df = df[b]

In [279]:
# Make a copy of the dataframe with columns label and comment using .copy() method

# Your code here
df_cp = df[['label', 'comment']].copy()
print(df_cp.head(3))

    label                                            comment
0       1                      This can't possibly go wrong.
18      1  Thank god we've got the European Union to coor...
25      1  That's very impressive work young men, but eve...


In [280]:
# Split the data into train and test sets with a test size of 0.2

# Your code here
data_train, data_test = train_test_split(df_cp, test_size=0.2)

## Character level tokenization and filtering of smaller/longer sentences


* Tokenize the data by fitting the train data with a vocabulary size of 100, using character-level tokenization instead of whitespace tokenization.
* Preprocess the data, so that the label column is of type int and the comments are represented as vectors got by tokenization.
* Filter tokenized comments that have less than 10 and more than 100 tokens for both the train and test sets.

In [281]:
data_train.comment = data_train.comment.str.replace("[^a-zA-Z0-9 ]",'', regex=True)
data_test.comment = data_test.comment.str.replace("[^a-zA-Z0-9 ]",'', regex=True)
print(data_train.comment)

354619                                 I run mine at 42 Psy
239770        Ghost is making light of my spiritual beliefs
434961    These werent inengine cutscenes they were enti...
139694                                   This is deplorable
429770    Oh that completely changes things he was only ...
                                ...                        
339149                                         LIKELY MASON
89781     Well I suspect the Republicans are experts on ...
322893      The empty desk has been doing a good job though
173868          But hes a guy youd want to have a beer with
324365      But thats what makes it the best game ever made
Name: comment, Length: 45423, dtype: object


In [282]:
# Define a tokenizer with vocabulary size of 100 that filters all special characters except whitespace and
# converts the characters to lowercase

# Your code here
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=100, filters=' !"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, char_level=True)
# Fit the tokenizer on the comments column of the train set

# Your code here
tokenizer.fit_on_texts(data_train.comment)

In [283]:
print(data_train.comment)

354619                                 I run mine at 42 Psy
239770        Ghost is making light of my spiritual beliefs
434961    These werent inengine cutscenes they were enti...
139694                                   This is deplorable
429770    Oh that completely changes things he was only ...
                                ...                        
339149                                         LIKELY MASON
89781     Well I suspect the Republicans are experts on ...
322893      The empty desk has been doing a good job though
173868          But hes a guy youd want to have a beer with
324365      But thats what makes it the best game ever made
Name: comment, Length: 45423, dtype: object


In [284]:
# Print the tokens and their integer representation from the vocabulary of the tokenizer
tokenizer.word_index

{' ': 1,
 'e': 2,
 't': 3,
 'a': 4,
 'o': 5,
 'i': 6,
 's': 7,
 'n': 8,
 'r': 9,
 'h': 10,
 'l': 11,
 'd': 12,
 'u': 13,
 'c': 14,
 'y': 15,
 'm': 16,
 'g': 17,
 'w': 18,
 'p': 19,
 'b': 20,
 'f': 21,
 'k': 22,
 'v': 23,
 'j': 24,
 'x': 25,
 '0': 26,
 'z': 27,
 '1': 28,
 '2': 29,
 'q': 30,
 '3': 31,
 '5': 32,
 '4': 33,
 '9': 34,
 '6': 35,
 '8': 36,
 '7': 37}

In [285]:
# Convert the label column of the train and test
label_train = data_train.label.values
label_test = data_test.label.values

In [286]:
label_train[:20]

array([0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1])

In [287]:
# Convert the comments column to sequences

# Your code here
data_train = tokenizer.texts_to_sequences(data_train.comment)
data_test = tokenizer.texts_to_sequences(data_test.comment)


In [288]:
# Print the first element of the train set and the number of elements in train
print(data_train[0])
len(data_train)

[6, 1, 9, 13, 8, 1, 16, 6, 8, 2, 1, 4, 3, 1, 33, 29, 1, 19, 7, 15]


45423

In [289]:
# Print the first element of the test set and the number of elements in test set
print(data_test[0])
len(data_test)

[7, 5, 16, 2, 1, 3, 6, 16, 1, 20, 13, 9, 3, 5, 8, 1, 7, 10, 6, 3, 1, 9, 6, 17, 10, 3, 1, 3, 10, 2, 9, 2]


11356

In [290]:
# Compute the length of each sentence of the train and test set
N_train = np.array([len(i) for i in data_train])

# Your code here
N_test = np.array([len(i) for i in data_test])

In [291]:
# Create a boolean mask to filter out sentences which are less than 10 tokens/ more than 100 tokens.

# Your code here
b1 = N_train > 10
b2 = N_train < 100
b = np.logical_and(b1, b2)
N_train = N_train[b]
data_train = np.array(data_train)[b]
label_train = label_train[b]
b1 = N_test > 10
b2 = N_test < 100
b = np.logical_and(b1, b2)
N_test = N_test[b]
data_test = np.array(data_test)[b]
label_test = label_test[b]

  data_train = np.array(data_train)[b]
  data_test = np.array(data_test)[b]


In [292]:
# Print the number of elements in train set
len(data_train)

37555

In [293]:
# Print the number of elements in train set

len(data_test)

9496

## Tensorflow Dataset for training and validation datasets



* Build a Tensorflow dataset from the train and test data.
  * First, convert train and test data into ragged tensors.
  * Next, create a tensorflow dataset using the ragged tensors. Use `tf.data.Dataset.from_tensor_slices()` to slice the tensors along the first dimension and create individual data points in the dataset.
  * Further, we will shuffle and batch our dataset with a batch size of 512.
  * Next, we will have to pad the data. Use the `map()` function to pad every sentence in a batch to the same length.
  * Finally, use prefetching. The prefetch operation will decouple the preparation of the batch with the execution itself. Removing the waiting time of the network for more data.
  
Since, we are working with lots of sentences, padding the entire dataset to a common sequence length is wasteful.By following the above steps and building a TensorFlow dataset, we can do operations such as padding, shuffling and others on the go which helps reduce computation time.

* Take a single batch from the train dataset, and verify the pipeline works correctly by printing the shape of the input tensor. Try to print the shapes with a few different batches.



In [294]:
# Use tensorflow ragged constants to get the ragged version of the train data

# Your code here
ragged_train = tf.ragged.constant(data_train)
ragged_test = tf.ragged.constant(data_test)


In [295]:
label_train = tf.ragged.constant(label_train)
label_test = tf.ragged.constant(label_test)

In [296]:
ragged_train

<tf.RaggedTensor [[6, 1, 9, 13, 8, 1, 16, 6, 8, 2, 1, 4, 3, 1, 33, 29, 1, 19, 7, 15],
 [17, 10, 5, 7, 3, 1, 6, 7, 1, 16, 4, 22, 6, 8, 17, 1, 11, 6, 17, 10, 3, 1,
  5, 21, 1, 16, 15, 1, 7, 19, 6, 9, 6, 3, 13, 4, 11, 1, 20, 2, 11, 6, 2, 21,
  7]                                                                        ,
 [3, 10, 2, 7, 2, 1, 18, 2, 9, 2, 8, 3, 1, 6, 8, 2, 8, 17, 6, 8, 2, 1, 14,
  13, 3, 7, 14, 2, 8, 2, 7, 1, 3, 10, 2, 15, 1, 18, 2, 9, 2, 1, 2, 8, 3, 6,
  9, 2, 11, 15, 1, 19, 9, 2, 9, 2, 8, 12, 2, 9, 2, 12, 1, 20, 15, 1, 4, 25,
  6, 7, 1, 4, 8, 6, 16, 4, 3, 6, 5, 8]                                     ,
 ...,
 [3, 10, 2, 1, 2, 16, 19, 3, 15, 1, 12, 2, 7, 22, 1, 10, 4, 7, 1, 20, 2, 2,
  8, 1, 12, 5, 6, 8, 17, 1, 4, 1, 17, 5, 5, 12, 1, 24, 5, 20, 1, 3, 10, 5,
  13, 17, 10]                                                              ,
 [20, 13, 3, 1, 10, 2, 7, 1, 4, 1, 17, 13, 15, 1, 15, 5, 13, 12, 1, 18, 4,
  8, 3, 1, 3, 5, 1, 10, 4, 23, 2, 1, 4, 1, 20, 2, 2, 9, 1, 18, 6, 3, 1

In [297]:
# Print the ragged tensor returned
tf.shape(ragged_train)

<DynamicRaggedShape lengths=[37555, (20, 45, 80, 18, 75, 60, 84, 50, 75, 54, 69, 19, 52, 33, 46, 32, 19, 75, 43, 72, 14, 40, 58, 47, 86, 26, 34, 31, 38, 33, 46, 18, 27, 50, 56, 90, 90, 31, 20, 66, 57, 14, 58, 22, 72, 42, 32, 68, 47, 30, 48, 21, 45, 64, 31, 57, 55, 17, 70, 99, 39, 45, 44, 59, 38, 20, 67, 28, 15, 46, 57, 54, 53, 31, 95, 77, 82, 49, 41, 49, 73, 44, 25, 60, 81, 60, 24, 40, 76, 71, 56, 61, 64, 43, 18, 57, 40, 70, 80, 24, 60, 97, 95, 72, 32, 49, 60, 44, 79, 77, 13, 36, 64, 14, 76, 16, 49, 23, 70, 36, 46, 18, 96, 49, 52, 26, 95, 77, 15, 73, 29, 21, 96, 21, 46, 83, 57, 54, 60, 16, 30, 64, 19, 42, 67, 75, 30, 29, 41, 19, 22, 46, 39, 97, 59, 65, 41, 72, 81, 49, 32, 20, 70, 16, 44, 37, 78, 19, 42, 19, 70, 33, 84, 16, 24, 31, 51, 15, 60, 16, 37, 73, 24, 37, 29, 19, 13, 47, 15, 13, 55, 74, 38, 27, 42, 45, 63, 97, 11, 33, 40, 46, 64, 28, 38, 32, 66, 19, 31, 53, 53, 28, 40, 55, 83, 14, 17, 42, 67, 39, 38, 28, 26, 26, 69, 70, 57, 29, 56, 32, 24, 42, 26, 46, 45, 50, 62, 26, 35, 70, 46,

In [298]:
# Print the ragged label tensor returned

label_train

<tf.Tensor: shape=(37555,), dtype=int64, numpy=array([0, 1, 0, ..., 1, 1, 1])>

In [299]:
# Convert the train set into a tensorflow dataset


# Your code here
dataset_train = tf.data.Dataset.from_tensor_slices((ragged_train, label_train))
# Shuffle the dataset

# Your code here
dataset_train = dataset_train.shuffle(10000)
# Batch the dataset into batch size of 512

# Your code here
dataset_train = dataset_train.batch(512)
# Pad each input batch with 0s
dataset_train = dataset_train.map(lambda x,y: (x.to_tensor(default_value=0, shape=[None, None]), y), num_parallel_calls=5)

In [300]:
# Prefetching


# Your code here
dataset_train = dataset_train.prefetch(5)

In [301]:
for X_tr,y_tr in dataset_train.take(1):
    print("type : {} \n".format(type(X_tr)))
    print("Input Shape (batch_size,sequence length) : {} \n".format(X_tr.shape))

type : <class 'tensorflow.python.framework.ops.EagerTensor'> 

Input Shape (batch_size,sequence length) : (512, 99) 



2023-07-02 22:32:04.225672: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int64 and shape [37555]
	 [[{{node Placeholder/_1}}]]
2023-07-02 22:32:04.226075: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int64 and shape [37555]
	 [[{{node Placeholder/_1}}]]


In [302]:
# Look at a single example
X_tr

<tf.Tensor: shape=(512, 99), dtype=int32, numpy=
array([[ 6,  1, 14, ...,  0,  0,  0],
       [ 8,  5,  1, ...,  0,  0,  0],
       [ 6,  1, 24, ...,  0,  0,  0],
       ...,
       [ 7,  6, 16, ...,  0,  0,  0],
       [20, 13,  3, ...,  0,  0,  0],
       [18,  2, 11, ...,  0,  0,  0]], dtype=int32)>

In [303]:
# Index to word mapping

# Your code here
index_2_word = {v:k for k,v in tokenizer.word_index.items()}
print(type(tokenizer.word_index))
# Reconstruct the sentence using tokens
''.join([index_2_word[i] for i in X_tr[0].numpy() if i!=0])

<class 'dict'>


'i   c a n t   w a i t   t i l l   s l a v e r y   b e c o m e s   p o p u l a r   a g a i n'

In [304]:
# Checking sequence length counts across batches
seq_len = []
for X_tr, y_tr in dataset_train.as_numpy_iterator():
  seq_len.append(X_tr.shape[1])

# Checking the count of seq lengths
Counter(seq_len)

2023-07-02 22:32:04.405588: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int64 and shape [37555]
	 [[{{node Placeholder/_1}}]]
2023-07-02 22:32:04.405949: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype variant and shape [37555]
	 [[{{node Placeholder/_0}}]]


Counter({99: 69, 98: 5})

## Defining the model and training

* Build a GRU model

In [305]:
# Build a GRU model
# Your code here
model_GRU = tf.keras.Sequential()
model_GRU.add(tf.keras.layers.Embedding(input_dim=100, output_dim=32, mask_zero=True))
model_GRU.add(tf.keras.layers.GRU(32))
model_GRU.add(tf.keras.layers.Dense(1, activation='sigmoid'))
#loss function and optimizer
model_GRU.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history_GRU = model_GRU.fit(dataset_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


* Build a CNN + Deep, BiDirectional GRU Model

CNNs are good at learning spatial features, and sentences can be thought of as 1-D spatial vectors (dimensionality is determined by the number of words in the sentence). We can then take the features learned by the CNN (after a maxpooling layer) and feed them into an GRU! We expect the CNN to be able to pick out invariant features across the 1-D spatial structure (i.e., sentence) that characterize good and bad sentiment. This learned spatial features may then be learned as sequences by a reccurent layer. The classification step is then performed by a final dense layer.

In [306]:
# Build a CNN+DEEP, Birdirectional GRU model
# Your code here
model_CNN = tf.keras.Sequential()
model_CNN.add(tf.keras.layers.Embedding(input_dim=100, output_dim=32, mask_zero=True))
model_CNN.add(tf.keras.layers.Conv1D(32, 3, activation='relu'))
model_CNN.add(tf.keras.layers.MaxPooling1D(3))
model_CNN.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32)))
model_CNN.add(tf.keras.layers.Dense(1, activation='sigmoid'))
#loss function and optimizer
model_CNN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history_CNN = model_CNN.fit(dataset_train, epochs=5)

Epoch 1/5


2023-07-02 22:32:41.410858: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-07-02 22:32:41.412326: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-07-02 22:32:41.413917: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Evaluating the model

* Predict on the test data using the model trained above.
* Print the classification report to get the precision, recall and other metrics.
* Write down 5 sarcastic sentences yourself and use the model to predict if these sentences are sarcastic or not. Print out the probability of that sentence being sarcastic.

In [307]:
# Predict on the test set

# Your code here
y_pred_GRU = model_GRU.predict(ragged_test.to_tensor(default_value=0, shape=[None, None]))
y_pred_CNN = model_CNN.predict(ragged_test.to_tensor(default_value=0, shape=[None, None]))



2023-07-02 22:33:12.824462: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-07-02 22:33:12.827098: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-07-02 22:33:12.828301: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



In [308]:
pred = np.where(y_pred_GRU > 0.5, 1, 0)

In [309]:
# Print the classification report comparing the true labels and the predicted ones
print(classification_report(label_test, pred))

              precision    recall  f1-score   support

           0       0.54      0.11      0.18      4423
           1       0.54      0.92      0.68      5073

    accuracy                           0.54      9496
   macro avg       0.54      0.51      0.43      9496
weighted avg       0.54      0.54      0.45      9496



In [310]:
pred = np.where(y_pred_CNN > 0.5, 1, 0)
print(type(y_pred_CNN))

<class 'numpy.ndarray'>


In [311]:
print(classification_report(label_test, pred))

              precision    recall  f1-score   support

           0       0.63      0.56      0.59      4423
           1       0.65      0.71      0.68      5073

    accuracy                           0.64      9496
   macro avg       0.64      0.64      0.64      9496
weighted avg       0.64      0.64      0.64      9496



In [312]:
eval_GRU = model_GRU.evaluate(ragged_test.to_tensor(default_value=0, shape=[None, None]), label_test)
print(eval_GRU)

[0.6880199313163757, 0.5420176982879639]


In [313]:
def process_sentence(sentence):
    processed_sentence = tf.expand_dims(tf.convert_to_tensor(tokenizer.texts_to_sequences(sentence)),axis=0)
    print('Shape of tokenized sentence:', (processed_sentence.shape))
    print('Tokenized sentence', processed_sentence)
    return processed_sentence

sentence = 'This is just for testing the sarcasm, does it sound funny'
tokenized_sentence = process_sentence(sentence)

ValueError: Can't convert non-rectangular Python sequence to Tensor.

In [None]:
# Predict sarcasm probabililty with the sentence

# Your code here