In [1]:
# corpus
example = [
    'This instrument measures the planes altitude',
    'This instrument measures the planes latitude'
]

### Tokenization with Out-of-Vocabulary Handling

Tokenization converts a text corpus into a sequence of words (tokens), enabling the model to work with numerical data. We also introduce an "out-of-vocabulary" (OOV) token to handle any words that the model hasn't seen during training.


In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

# Define the example corpus
example = [
    'This instrument measures the planes altitude',
    'This instrument measures the planes latitude'
]

# Updated tokenizer with out-of-vocabulary (OOV) handling
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(example)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(example)

print("Word Index:", word_index)
print("Sequences:", sequences)

Word Index: {'<OOV>': 1, 'this': 2, 'instrument': 3, 'measures': 4, 'the': 5, 'planes': 6, 'altitude': 7, 'latitude': 8}
Sequences: [[2, 3, 4, 5, 6, 7], [2, 3, 4, 5, 6, 8]]


### Out of vocab words

In [3]:
# new corpus
plane_status = [
    'This instrument measures the planes altitude',
    'This instrument measures the planes latitude',
    'Nothing is wrong everybody onboard is doing ok and enjoying dinner'
]
# tokenizer
tokenizer = Tokenizer(num_words = 100)
tokenizer.fit_on_texts(plane_status)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(plane_status)
print(word_index)
print(sequences)


{'this': 1, 'instrument': 2, 'measures': 3, 'the': 4, 'planes': 5, 'is': 6, 'altitude': 7, 'latitude': 8, 'nothing': 9, 'wrong': 10, 'everybody': 11, 'onboard': 12, 'doing': 13, 'ok': 14, 'and': 15, 'enjoying': 16, 'dinner': 17}
[[1, 2, 3, 4, 5, 7], [1, 2, 3, 4, 5, 8], [9, 6, 10, 11, 12, 6, 13, 14, 15, 16, 17]]


In [4]:
test_data = [
  'Nothing works! We were wrong about the altitude! ok what do we do now?',
  'What does this button on planes do? Are alarm sounds ok?'
]

In [5]:
test_sequences = tokenizer.texts_to_sequences(test_data)
print(word_index)
print(test_sequences)

{'this': 1, 'instrument': 2, 'measures': 3, 'the': 4, 'planes': 5, 'is': 6, 'altitude': 7, 'latitude': 8, 'nothing': 9, 'wrong': 10, 'everybody': 11, 'onboard': 12, 'doing': 13, 'ok': 14, 'and': 15, 'enjoying': 16, 'dinner': 17}
[[9, 10, 4, 7, 14], [1, 5, 14]]


translate back to:

"Nothing wrong, the altitude ok"

and 

"this planes ok"

Our AI is not very I right now... it is a good thing AI doesn't fly planes, am I right?

In [6]:
# Initialize tokenizer with oov_token
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(example)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(example)
print("Word Index:", word_index)
print("Sequences:", sequences)


Word Index: {'<OOV>': 1, 'this': 2, 'instrument': 3, 'measures': 4, 'the': 5, 'planes': 6, 'altitude': 7, 'latitude': 8}
Sequences: [[2, 3, 4, 5, 6, 7], [2, 3, 4, 5, 6, 8]]


### Padding

### Let's load in some new text for fun

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [8]:
#review titles from amazon for my black shirt(s)
review1= "Wash by hand and line dry ONLY"
review2= "You can't go wrong with these premium polo shirts. Really nice texture and cool in the sun."
review3= "There are a few out there that don't snag easily but this is not one of those"
review4= "Great Shirt"

In [9]:
# new corpus
shirt_header = [review1, review2, review3, review4]
    
# tokenizer
tokenizer = Tokenizer(num_words = 100)
tokenizer.fit_on_texts(shirt_header)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(shirt_header)
print(word_index)
print(sequences)

{'and': 1, 'there': 2, 'wash': 3, 'by': 4, 'hand': 5, 'line': 6, 'dry': 7, 'only': 8, 'you': 9, "can't": 10, 'go': 11, 'wrong': 12, 'with': 13, 'these': 14, 'premium': 15, 'polo': 16, 'shirts': 17, 'really': 18, 'nice': 19, 'texture': 20, 'cool': 21, 'in': 22, 'the': 23, 'sun': 24, 'are': 25, 'a': 26, 'few': 27, 'out': 28, 'that': 29, "don't": 30, 'snag': 31, 'easily': 32, 'but': 33, 'this': 34, 'is': 35, 'not': 36, 'one': 37, 'of': 38, 'those': 39, 'great': 40, 'shirt': 41}
[[3, 4, 5, 1, 6, 7, 8], [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 21, 22, 23, 24], [2, 25, 26, 27, 28, 2, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39], [40, 41]]


In [10]:
padded = pad_sequences(sequences)
print(padded)

[[ 0  0  0  0  0  0  0  0  0  0  3  4  5  1  6  7  8]
 [ 9 10 11 12 13 14 15 16 17 18 19 20  1 21 22 23 24]
 [ 2 25 26 27 28  2 29 30 31 32 33 34 35 36 37 38 39]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 40 41]]


## Stopwords

### Remove HTML tags

In the code below we clean up HTML content by removing the HTML tags and leaving only the plain text.



In [11]:
bad_scraping_job = "<span>Wash by hand and line dry ONLY.</span>"

In [12]:
from bs4 import BeautifulSoup
soup =BeautifulSoup(bad_scraping_job)
clean_text = soup.get_text()
print(bad_scraping_job)
print(clean_text)

<span>Wash by hand and line dry ONLY.</span>
Wash by hand and line dry ONLY.


### Remove stop words using a list

Stopwords are common words (such as "the", "is", "in") that are often removed in natural language processing tasks to focus on more meaningful words.



In [13]:
my_stopword_list=['um', 'ah', 'er', 'eh', 'hmm', 'a', 'the']

In [14]:
example = 'um how about a er hmm turkey sandwich with ah cheese'

words = example.split()
filtered_example= ''
for word in words:
    if word not in my_stopword_list:
        filtered_example = filtered_example + word + " "
print(filtered_example)

how about turkey sandwich with cheese 


### Stopword Removal using NLTK

Stopwords are common words (such as "the", "is", "in") that are often removed in natural language processing tasks to focus on more meaningful words. In the above example we provided a stop word list, but they already exist and we can just use those if we want to. Below we use the NLTK library to remove these stopwords.

In [15]:
# note this package needs to be installed
import nltk
from nltk.corpus import stopwords

# Download stopwords
nltk.download('stopwords')

# Use stopwords from nltk
stop_words = set(stopwords.words('english'))

example = 'um how about a er hmm turkey sandwich with ah cheese'
words = example.split()

filtered_example = ' '.join([word for word in words if word.lower() not in stop_words])
print(filtered_example)


ModuleNotFoundError: No module named 'nltk'