In [4]:
# Part-of-speech (POS) tagging
#
# Splits a short passage into word tokens and labels each token with a
# part-of-speech tag (e.g. NN, VBZ, JJ) using NLTK's default tagger.
import nltk

# Adjacent string literals are joined at compile time, so the passage can be
# wrapped across source lines without the indentation leaking into the text
# (a backslash continuation inside the quotes keeps those leading spaces).
text = (
    "Text preprocessing is an important step in natural language processing. "
    "It involves cleaning and transforming raw text data into a format "
    "suitable for analysis and machine learning models."
)

# word_tokenize and pos_tag rely on NLTK data packages; if either call raises
# LookupError, the message names the resource to fetch with nltk.download(...).
tokens = nltk.word_tokenize(text)
pos_tags = nltk.pos_tag(tokens)

# Each entry is a (token, tag) pair
print(pos_tags)

[('Text', 'NNP'), ('preprocessing', 'NN'), ('is', 'VBZ'), ('an', 'DT'), ('important', 'JJ'), ('step', 'NN'), ('in', 'IN'), ('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('.', '.'), ('It', 'PRP'), ('involves', 'VBZ'), ('cleaning', 'VBG'), ('and', 'CC'), ('transforming', 'VBG'), ('raw', 'JJ'), ('text', 'NN'), ('data', 'NNS'), ('into', 'IN'), ('a', 'DT'), ('format', 'NN'), ('suitable', 'JJ'), ('for', 'IN'), ('analysis', 'NN'), ('and', 'CC'), ('machine', 'NN'), ('learning', 'NN'), ('models', 'NNS'), ('.', '.')]


In [6]:
# Text encoding / vectorization with TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: four short sentences that share most of their vocabulary
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Learn the vocabulary from the corpus and encode it in a single step
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Vocabulary terms (words) known to the fitted vectorizer
feature_names = vectorizer.get_feature_names_out()

# Expand the result into a plain array (one row per document) for printing
dense_array = tfidf_matrix.toarray()

# Report the weights first, then the feature names
print("TF-IDF Matrix:", dense_array, sep="\n")
print("\nFeature Names:", feature_names, sep="\n")

TF-IDF Matrix:
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]

Feature Names:
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


In [8]:
# Padding / truncation of token sequences to a fixed length
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Example sentences to encode
texts = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Build the word index from the example sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Encode every sentence as a list of integer word ids
sequences = tokenizer.texts_to_sequences(texts)

# Length that every encoded sentence is brought to
max_length = 10

# 'post' on both options: padding is added, and truncation is applied,
# at the end of each sequence
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

# Show the raw encodings followed by their fixed-length versions
print("Original Sequences:", sequences, sep="\n")
print("\nPadded/Truncated Sequences:", padded_sequences, sep="\n")

Original Sequences:
[[1, 2, 3, 5, 4], [1, 4, 2, 3, 6, 4], [7, 1, 2, 3, 8, 9], [2, 1, 3, 5, 4]]

Padded/Truncated Sequences:
[[1 2 3 5 4 0 0 0 0 0]
 [1 4 2 3 6 4 0 0 0 0]
 [7 1 2 3 8 9 0 0 0 0]
 [2 1 3 5 4 0 0 0 0 0]]
