In [1]:
## import the tensorflow APIs
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
import math
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
data_sitcoms = pd.read_csv("mustard++_text.csv")

# Adjust sitcom data
data_sitcoms = data_sitcoms.drop(columns=['SCENE','KEY','END_TIME','SPEAKER','SHOW','Sarcasm_Type','Implicit_Emotion','Explicit_Emotion','Valence','Arousal'], axis=1)
data_sitcoms = data_sitcoms.rename(columns={'SENTENCE':'text','Sarcasm':'label'})

# remove empty label rows
for index, row in data_sitcoms.iterrows():
    if math.isnan(row['label']):
        data_sitcoms = data_sitcoms.drop(index, axis='index')

data_sitcoms.head()

Unnamed: 0,text,label
5,"And of those few months, how long have you bee...",0.0
14,"Let the dead man talk. So, why do you think that?",0.0
18,"What else? Sell it on eBay as ""slightly used.""",0.0
24,"Good idea, sit with her. Hold her, comfort her...",1.0
31,"Well, now that I've given up string theory, I'...",0.0


In [3]:
train_sentences = data_sitcoms["text"]
##train_sentences

In [4]:
##instantiate the tokenizer
tokenizer = Tokenizer(num_words=100)

##train the tokenizer on training sentences
tokenizer.fit_on_texts(train_sentences)

##store word index for the words in the sentence
word_index = tokenizer.word_index
##print(word_index)

In [5]:
##create sequences using tokenizer
sequences = tokenizer.texts_to_sequences(train_sentences)

In [6]:
##print word index dictionary and sequences
##print(f"Word index -->{word_index}")
##print(f"Sequences of words -->{sequences}")

In [7]:
##print sample sentence and sequence
train_sentences = train_sentences.to_numpy()
print(train_sentences[0])
print(sequences[0])

And of those few months, how long have you been a demented sex pervert?
[6, 8, 50, 24, 1, 4]


In [8]:
##set up the tokenizer again with oov_token
tokenizer = Tokenizer(num_words=100, oov_token = "<oov>")

##train the new tokenizer on training sentences
tokenizer.fit_on_texts(train_sentences)

##store word index for the words in the sentence
word_index = tokenizer.word_index

In [9]:
##pad sequences
padded_seqs = pad_sequences(sequences)

In [11]:
##print(word_index)
##print(train_sentences)
##print(sequences)
##print(padded_seqs)

In [13]:
##pad sequences with padding type, max length and truncating parameters
padded_seqs = pad_sequences(sequences,
                            padding="post",
                            maxlen=5,
                            truncating="post",
                            )
print(padded_seqs)

[[ 6  8 50 24  1]
 [ 3 29 59 27  1]
 [23  7 17 58  0]
 ...
 [31 72 89 67 42]
 [ 2  9 16  3  0]
 [ 9 42 57  0  0]]
