In [1]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# data split into feature and target
train = pd.read_csv('D:\project\AWS\Emotional_analysis_NLP\data\processed\\train.csv')
train.head()

Unnamed: 0,text,label
0,ive blabbed enough tonight im tired ive feelin...,0
1,woke really early morning drove feel ecstatic ...,1
2,feel never gave rest day megabrick feeling stu...,3
3,feeling restless teary flat sad strange today,4
4,feel like im doomed ive even began,0


In [27]:
X_train = train['text'].astype(str)
y_train = train['label']

In [18]:
test = pd.read_csv('D:\project\AWS\Emotional_analysis_NLP\data\processed\\test.csv')
test.head()

Unnamed: 0,text,label
0,id say maybe made feel foolish would reeeeeeal...,0
1,joined lds church admit feeling somewhat asham...,0
2,must admit didnt feel like hugging angry disgu...,3
3,hate still feel nerve damaged badly enough oft...,0
4,im actually feeling little smug,1


In [28]:
X_test = test['text'].astype(str)
y_test = test['label']

In [24]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (333447,)
X_test shape: (83362,)
y_train shape: (333447,)
y_test shape: (83362,)


In [25]:
 X_train

0         ive blabbed enough tonight im tired ive feelin...
1         woke really early morning drove feel ecstatic ...
2         feel never gave rest day megabrick feeling stu...
3             feeling restless teary flat sad strange today
4                        feel like im doomed ive even began
                                ...                        
333442                               feel love fell beloved
333443    realized often time isnt reaction started feel...
333444                                         feel envious
333445    im still impatient frequently irritable time i...
333446                                 feel weird apartment
Name: text, Length: 333447, dtype: object

# Tokenization and padding seq

1. Tokenization: The text data in X_train and X_test is tokenized using the Tokenizer class from Keras. This step converts the text data into sequences of integers, where each unique word in the dataset is assigned a unique integer index. The num_words parameter limits the vocabulary size to 50,000 most frequent words.
1. Padding: After tokenization, the sequences in X_train and X_test are padded to ensure uniform length. This is achieved using the pad_sequences function, which pads sequences with zeros to make them all of the same length (maxlen). Padding is done after the sequences to ensure that the actual content of the text is preserved.
Overall, the code prepares the text data for further processing and modeling by converting it into tokenized and padded sequences, making it suitable for use in machine learning algorithms such as neural networks

In [29]:
# Tokenize the text data
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(X_train)
tokenizer.fit_on_texts(X_test)


In [30]:
tokenizer.document_count

416809

In [31]:
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

In [32]:
# Max Len in X_train_sequences
maxlen = max(len(tokens) for tokens in X_train_sequences)
print("Maximum sequence length (maxlen):", maxlen)

Maximum sequence length (maxlen): 79


In [33]:
# Perform padding on X_train and X_test sequences
X_train_padded = pad_sequences(X_train_sequences, maxlen=maxlen, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=maxlen, padding='post')

In [34]:
# Print the padded sequences for X_train and X_test
print("X_train_padded:")
print(X_train_padded)
print("\nX_test_padded:")
print(X_test_padded)

X_train_padded:
[[   20 26719    73 ...     0     0     0]
 [  275     5   738 ...     0     0     0]
 [    1    45   735 ...     0     0     0]
 ...
 [    1   606     0 ...     0     0     0]
 [    4    18   476 ...     0     0     0]
 [    1   149  1686 ...     0     0     0]]

X_test_padded:
[[  123    33   172 ...     0     0     0]
 [ 2941 12893   748 ...     0     0     0]
 [  194   310    48 ...     0     0     0]
 ...
 [  851   276  1128 ...     0     0     0]
 [ 1874  6225   645 ...     0     0     0]
 [   83     1   401 ...     0     0     0]]


In [35]:
print(f'X_train shape: {X_train_padded.shape}')
print(f'X_test shape: {X_test_padded.shape}')

X_train shape: (333447, 79)
X_test shape: (83362, 79)


In [38]:
len(tokenizer.index_word)

67634

# vocalbulary size

In [39]:
# Embedding Vocabulary Size 
vocabulary_size = len(set(token for sequence in X_train_padded for token in sequence))
vocabulary_size

49510