### Tokenize and sequence a bigger corpus of text

In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
print(tf.__version__)

2.13.0


### Get the corpus of text

In [17]:
import os
import pandas as pd
import numpy as np

data_path = '/home/login/Documents/Machine_learning/Datasets/reviews/reviews.csv'
df = pd.read_csv(data_path)\

df.sample(1)

Unnamed: 0.1,Unnamed: 0,text,sentiment
1335,1335,It's worth driving up from Tucson!,1


In [18]:
df.drop('Unnamed: 0', axis = 1, inplace = True)
df.sample(1)

Unnamed: 0,text,sentiment
1264,You can't beat that.,1


Get the reviews from text column

In [21]:
reviews = df['text'].tolist()
reviews[0]

'So there is no way for me to plug it in here in the US unless I go by a converter.'

### Tokenize the text

In [28]:
# tokenizer = Tokenizer(num_words = 100, oov_token = "<OOV>")
tokenizer = Tokenizer(oov_token = "<OOV>")

In [30]:
tokenizer.fit_on_texts(reviews)
word_index = tokenizer.word_index
print(len(word_index))
print(word_index)

3261


### Generate sequences for the reviews

In [34]:
sequences = tokenizer.texts_to_sequences(reviews)
print(sequences)

[[28, 59, 8, 56, 142, 13, 61, 7, 269, 6, 15, 46, 15, 2, 149, 449, 4, 60, 113, 5, 1429], [18, 110, 87, 397], [19, 13, 2, 732], [1430, 7, 166, 13, 733, 969, 90, 74, 970, 125, 1431, 248], [2, 606, 8, 19], [4, 23, 7, 1432, 2, 269, 7, 80, 6, 7, 508, 53, 150, 7, 80, 450, 249], [47, 24, 23, 292, 1433, 76, 292, 1434, 734, 121, 971, 2, 509, 11, 607, 398, 11, 119, 41, 113, 41], [47, 24, 31, 510, 608, 24, 270, 23, 10], [735, 7, 137, 4, 736, 14, 126], [81, 5, 167, 11, 126, 3, 39], [3, 2, 73, 43, 8, 19], [155, 9, 17, 182, 77, 134, 48, 2, 451, 71, 7, 2, 972, 71], [47, 2, 143, 42, 1435, 113, 5, 1436, 183, 1437, 4, 399, 7, 973, 1438, 737, 3, 1439, 73, 48, 2, 68], [17, 18, 43, 400], [2, 271, 8, 17, 1440, 35, 2, 96, 609, 8, 12, 17, 168, 36, 34], [229, 88, 13, 103, 41, 272, 62, 5, 511, 974, 20], [4, 975, 401, 107, 12, 32, 1441], [28, 156, 28, 18], [65, 19], [6, 1442, 250, 27, 15, 5, 142, 21, 273, 24, 1443, 135, 175, 21, 1444, 44, 214], [4, 157, 22, 1445, 738, 3, 1446, 34, 1447, 29, 104, 12, 80, 6, 7, 739

In [67]:
padded_sequences = pad_sequences(sequences, padding = 'post')
print(len(padded_sequences[1, :]))
print(padded_sequences.shape)
print(padded_sequences)

139
(1992, 139)
[[ 28  59   8 ...   0   0   0]
 [ 18 110  87 ...   0   0   0]
 [ 19  13   2 ...   0   0   0]
 ...
 [239   4   9 ...   0   0   0]
 [  2 644 140 ...   0   0   0]
 [121  35  47 ...   0   0   0]]


In [68]:
padded_sequences2 = pad_sequences(sequences, padding = 'post', maxlen = 20, truncating = 'pre')
print(len(padded_sequences2[1, :]))
print(padded_sequences2)

20
[[  59    8   56 ...  113    5 1429]
 [  18  110   87 ...    0    0    0]
 [  19   13    2 ...    0    0    0]
 ...
 [ 239    4    9 ...    0    0    0]
 [   2  644  140 ...    0    0    0]
 [  14  197   59 ...  605    2  375]]


In [70]:
print(reviews[0])
print(padded_sequences[0])

So there is no way for me to plug it in here in the US unless I go by a converter.
[  28   59    8   56  142   13   61    7  269    6   15   46   15    2
  149  449    4   60  113    5 1429    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0]
