### Imports


In [22]:
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

# Download required NLTK resources.
# nltk.download() skips packages that are already up to date, so re-running
# this cell is cheap. force=True is intentionally NOT passed: it re-fetches
# and re-unzips the archive on every run and makes Restart & Run All depend
# on network access. If a local copy is ever corrupted, run
# nltk.download("punkt_tab", force=True) once by hand to repair it.
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/elloyd/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/elloyd/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/elloyd/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
from nltk.tokenize import word_tokenize
from nltk.tokenize.punkt import PunktSentenceTokenizer

# Smoke test: split a short sample string with a freshly constructed
# PunktSentenceTokenizer and print the result.
# The result is a list of sentences, so it is stored under its own name
# rather than `tokens`, which the Data Prep cell uses for word-level tokens.
tokenizer = PunktSentenceTokenizer()
sample_sentences = tokenizer.tokenize("Sample sentence for testing.")
print(sample_sentences)

['Sample sentence for testing.']


### Data Load


In [23]:

# Read the whole input file into memory as a single UTF-8 decoded string.
file_path = "NLP.txt"  # Change this to your file name
with open(file_path, mode="r", encoding="utf-8") as source:
    text = source.read()


### Data Prep


In [25]:

# Steps 1-2. Tokenize the raw text, keep only purely alphanumeric tokens
# (this drops punctuation tokens), and lowercase the survivors.
tokens = [raw.lower() for raw in word_tokenize(text) if raw.isalnum()]

# Step 3. Drop English stop words; a set is used for the membership test.
stop_words = set(stopwords.words("english"))
filtered_tokens = [tok for tok in tokens if tok not in stop_words]

# Step 4. Lemmatize every remaining token.
# NOTE(review): lemmatize() is called without a part-of-speech argument; the
# feature list printed by the Bag of Words cell still contains pairs such as
# 'call'/'called' and 'change'/'changing' -- consider POS-aware lemmatization.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, filtered_tokens))


### Bag of Words


In [26]:

# Step 5. Build a Bag of Words (BoW) representation.
# The cleaned tokens are joined back into one space-separated string and
# passed to CountVectorizer as a single-document corpus.
vectorizer = CountVectorizer()
document = " ".join(lemmatized_tokens)
bow_matrix = vectorizer.fit_transform([document])

# Report the cleaned tokens, the learned feature names, and the dense matrix.
feature_names = vectorizer.get_feature_names_out()
dense_counts = bow_matrix.toarray()
print("\nCleaned Tokens:", lemmatized_tokens)
print("\nBag of Words Features:", feature_names)
print("\nBag of Words Matrix:\n", dense_counts)



Bag of Words Features: ['10x' '2017' 'ability' 'able' 'academic' 'accelerate' 'accordingly'
 'accuracy' 'act' 'add' 'adjust' 'adopt' 'adoption' 'advance' 'advanced'
 'affect' 'aggressively' 'ago' 'ahead' 'ai' 'aligned' 'already' 'also'
 'although' 'ambitious' 'analyst' 'analytics' 'analyzing' 'answer'
 'answering' 'anthony' 'anticipate' 'appeared' 'applicable' 'application'
 'apps' 'archive' 'area' 'around' 'article' 'artificial' 'asset' 'assist'
 'assistant' 'attention' 'automate' 'automates' 'automation' 'autonlp'
 'avoid' 'avoided' 'aware' 'banking' 'barrier' 'based' 'basic' 'become'
 'begin' 'beginning' 'benefit' 'best' 'bet' 'better' 'block' 'blog' 'boat'
 'bottom' 'bound' 'brainstorming' 'branch' 'breakthrough' 'broad'
 'business' 'call' 'called' 'capability' 'capable' 'capital' 'capitalize'
 'carry' 'cautious' 'certainly' 'chain' 'change' 'changing' 'chief'
 'classifying' 'code' 'codex' 'coding' 'cognitive' 'cognizant'
 'combination' 'commercial' 'common' 'company' 'competitio