In [3]:
import tensorflow as tf
from tensorflow import keras
from keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer

If you have a sentence called I love my dog! it wont consider it as a different word because the Tokenizer helps to remove punctuations <br>
It will always lowercase the letters

In [7]:
sentences = ["I love my dog",
             "I love my cat",
             "You love my dog!",
             "Do you think my dog is amazing?"]

tokenizer = Tokenizer(num_words=10)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)

print(word_index)
print(sequences)


{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}
[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9]]


as you can see below the test data should have the same corpus from the training data or else it wouldnt encode properly

In [8]:
test_data = [
    "i really love my dog",
    "my dog loves my manatee"
]

test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

[[4, 2, 1, 3], [1, 3, 1]]


We need to add a special token for words that are out of our training vocab

In [13]:
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

print(word_index)

test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]


Time to add padding to make sure that each sentence has a uniform size

In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = ["I love my dog",
             "I love my cat",
             "You love my dog!",
             "Do you think my dog is amazing?"]

tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)
print("Before Padding:")
print(sequences)

padded_sequences = pad_sequences(sequences)
print("After Sequences:")
print(padded_sequences)

Before Padding:
[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]
After Sequences:
[[ 0  0  0  5  3  2  4]
 [ 0  0  0  5  3  2  7]
 [ 0  0  0  6  3  2  4]
 [ 8  6  9  2  4 10 11]]


post means that it will add the numebr of 0's at the end and if you set the maxlen to 5 it will remove the words from the start. To override this you will use the truncating parameter to post.

In [24]:
padded_sequences_edit = pad_sequences(sequences, padding="post",
                                      maxlen=5, truncating="post")
print(padded_sequences_edit)

[[5 3 2 4 0]
 [5 3 2 7 0]
 [6 3 2 4 0]
 [8 6 9 2 4]]


In [61]:
def remove_stopwords(sentence):
    # List of stopwords
    stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]
    
    # Sentence converted to lowercase-only
    sentence = str(sentence).lower()
    
    ### START CODE HERE
    words = sentence.split()
    result = [word for word in words if word not in stopwords]
    sentence = " ".join(result)
    
   
    ### END CODE HERE
    return sentence

In [62]:
import requests
import json

dataset_url = "https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json"
response = requests.get(dataset_url)

if (response.status_code == 200):
    data = response.json()

    # print(json.dumps(data, indent=4))
else:
    print(f"Failed to retrieve the JSON file. Status code: {response.status_code}")

sentences = []
labels = []
url = []
for item in data:
    sentences.append(remove_stopwords(item["headline"]))
    labels.append(item["is_sarcastic"])
    url.append(item["article_link"])

In [64]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

print("Total Unique Words:",len(word_index))

sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding="post")

index = 2
print(f'sample headline: {sentences[index]}')
print(f'padded sequence: {padded[index]}')
print()

# Print dimensions of padded sequences

Total Unique Words: 29583
sample headline: mom starting fear son's web series closest thing will grandchild
padded sequence: [   80   738   809  1644  1989   481  4625   143     7 10654     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0]

