# **Tokenizer**

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Example corpus used to fit the tokenizer: four short sentences that share
# most of their vocabulary.
# Every entry ends with a comma on purpose -- Python silently concatenates
# adjacent string literals, so a missing comma merges two sentences into one.
sentences = [
    'i love my dog',
    'I, love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?',
]

# Fit a default tokenizer on the corpus, then show the word -> integer index
# mapping it built from those sentences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=sentences)

word_index = tokenizer.word_index
print(word_index)

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}


# **Padding**

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Corpus for the padding demo: sentences of different lengths, so the encoded
# sequences differ in length too.
sentences = [
    "I love my dog",
    "I love my cat",
    "You love my dog!",
    "Do you think my dog is amazing?",
]

In [None]:
# Fit a tokenizer that reserves an explicit "<OOV>" entry in its vocabulary.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode every sentence as a list of word indices.
sequences = tokenizer.texts_to_sequences(sentences)

# Bring all sequences to a common length of 5 using the default
# padding/truncation settings of pad_sequences.
padded = pad_sequences(sequences, maxlen=5)

# Report the vocabulary, the raw sequences and the padded matrix.
print(f"\nWord Index =  {word_index}")
print(f"\nSequences =  {sequences}")
print("\nPadded Sequences:", padded, sep="\n")


Word Index =  {'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}

Sequences =  [[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]

Padded Sequences:
[[ 0  5  3  2  4]
 [ 0  5  3  2  7]
 [ 0  6  3  2  4]
 [ 9  2  4 10 11]]


In [None]:
# Encode sentences containing words the tokenizer never saw while fitting
# ("really", "loves", "house", "much") to see how they are represented.
test_data = [
    "i really love my dog",
    "my dog loves my house much",
]

test_seq = tokenizer.texts_to_sequences(test_data)
print(f"\nTest Sequence =  {test_seq}")

# Left-pad ('pre') every test sequence to a fixed length of 10.
padded = pad_sequences(test_seq, padding='pre', maxlen=10)
print("\nPadded Test Sequence: ", padded, sep="\n")


Test Sequence =  [[5, 1, 3, 2, 4], [2, 4, 1, 2, 1, 1]]

Padded Test Sequence: 
[[0 0 0 0 0 5 1 3 2 4]
 [0 0 0 0 2 4 1 2 1 1]]


# **Stopword Removal**

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages this cell relies on: the punkt tokenizer
# models, the stopword lists, and punkt_tab.
# NOTE(review): punkt_tab is presumably what newer NLTK releases load for
# word_tokenize -- confirm against the installed version.
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)


text = "This is a simple example to demonstrate stopword removal."

# Split the sentence into word and punctuation tokens.
words = word_tokenize(text)

# Keep only tokens whose lowercase form is not an English stopword.
# Punctuation tokens are not stopwords, so they pass through unchanged.
stop_words = set(stopwords.words('english'))
filtered_words = list(filter(lambda tok: tok.lower() not in stop_words, words))

print("Original:", words)
print("Without Stopwords:", filtered_words)

Original: ['This', 'is', 'a', 'simple', 'example', 'to', 'demonstrate', 'stopword', 'removal', '.']
Without Stopwords: ['simple', 'example', 'demonstrate', 'stopword', 'removal', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# **Stemming**

Stemming reduces words to their root or base form, often called a stem. The stem is not necessarily a dictionary word (for example, "easily" becomes "easili").

In [None]:
from nltk.stem import PorterStemmer

# Apply the Porter stemmer to a few inflected words and show the stems.
ps = PorterStemmer()
words = ["running", "runs", "easily", "fairly"]

# ps.stem is called once per word, in order.
stemmed_words = list(map(ps.stem, words))
print("Stemmed Words:", stemmed_words)


Stemmed Words: ['run', 'run', 'easili', 'fairli']


# **Lemmatization**

Lemmatization transforms words to their base dictionary form (the lemma).

In [None]:
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet data packages used by the lemmatizer.
for corpus in ('wordnet', 'omw-1.4'):
    nltk.download(corpus)

lemmatizer = WordNetLemmatizer()
words = ["running", "better", "easily", "fairly"]

# Every word is lemmatized as a verb (pos='v'); words that are not verb
# forms come back unchanged.
lemmatized_words = [lemmatizer.lemmatize(w, pos='v') for w in words]
print("Lemmatized Words:", lemmatized_words)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Lemmatized Words: ['run', 'better', 'easily', 'fairly']


# **n-gram precision**

In [None]:
from nltk.util import ngrams

# Split a sentence on whitespace and build its word n-grams.
text = "I love natural language processing"
tokens = text.split()

# ngrams() yields tuples lazily; materialise them as lists for printing.
bigrams = [*ngrams(tokens, 2)]    # consecutive word pairs (n=2)
trigrams = [*ngrams(tokens, 3)]   # consecutive word triples (n=3)

print("Bigrams:", bigrams)
print("Trigrams:", trigrams)


Bigrams: [('I', 'love'), ('love', 'natural'), ('natural', 'language'), ('language', 'processing')]
Trigrams: [('I', 'love', 'natural'), ('love', 'natural', 'language'), ('natural', 'language', 'processing')]
