In [1]:
#Install nltk
!pip install nltk




[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import nltk

In [3]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

# Text Cleaning

In [1]:
#Creating bunch of sentences
raw_docs = ["I am writing some very basic english sentences",
"I'm just writing it for the demo PURPOSE to make audience understand the basics .",
"The point is to _learn HOW it works_ on #simple # data."]

## Step 1 - convert to lower case

In [2]:
raw_docs = [doc.lower() for doc in raw_docs]
print(raw_docs)

['i am writing some very basic english sentences', "i'm just writing it for the demo purpose to make audience understand the basics .", 'the point is to _learn how it works_ on #simple # data.']


## Step 2 - Tokenization

In [3]:
print("word tokenize")

from nltk.tokenize import word_tokenize
tokenized_docs = [word_tokenize(doc) for doc in raw_docs]
print(tokenized_docs)

print("\nSentence tokenization")

from nltk.tokenize import sent_tokenize
sent_token = [sent_tokenize(doc) for doc in raw_docs]
print(sent_token)

word tokenize
[['i', 'am', 'writing', 'some', 'very', 'basic', 'english', 'sentences'], ['i', "'m", 'just', 'writing', 'it', 'for', 'the', 'demo', 'purpose', 'to', 'make', 'audience', 'understand', 'the', 'basics', '.'], ['the', 'point', 'is', 'to', '_learn', 'how', 'it', 'works_', 'on', '#', 'simple', '#', 'data', '.']]

Sentence tokenization
[['i am writing some very basic english sentences'], ["i'm just writing it for the demo purpose to make audience understand the basics ."], ['the point is to _learn how it works_ on #simple # data.']]


## Step 3 - Punctuation Removal

In [4]:
import string
import re

regex = re.compile('[%s]' % re.escape(string.punctuation))
#This line creates a regular expression pattern to match any character that is included in the string.punctuation constant. 
#re.escape() is used to escape any special characters within the punctuation string .

tokenized_docs_no_punctuation = []

for review in tokenized_docs:
    new_review = []
    for token in review:
        new_token = regex.sub(u'', token) # substitute any punctuation characters in the current token (token) with an empty string (''). This effectively removes all punctuation from the token.
        if not new_token == u'': # checks if the token after removing punctuation is not an empty string.
            new_review.append(new_token)
    
    tokenized_docs_no_punctuation.append(new_review)
    
print(tokenized_docs_no_punctuation)

[['i', 'am', 'writing', 'some', 'very', 'basic', 'english', 'sentences'], ['i', 'm', 'just', 'writing', 'it', 'for', 'the', 'demo', 'purpose', 'to', 'make', 'audience', 'understand', 'the', 'basics'], ['the', 'point', 'is', 'to', 'learn', 'how', 'it', 'works', 'on', 'simple', 'data']]


## Step 4 - Removing Stopwords

In [5]:
# Cleaning text of stopwords
from nltk.corpus import stopwords

tokenized_docs_no_stopwords = []

for doc in tokenized_docs_no_punctuation:
    new_term_vector = []
    for word in doc:
        if not word in stopwords.words('english'):
            new_term_vector.append(word)
    
    tokenized_docs_no_stopwords.append(new_term_vector)

print(tokenized_docs_no_stopwords)

[['writing', 'basic', 'english', 'sentences'], ['writing', 'demo', 'purpose', 'make', 'audience', 'understand', 'basics'], ['point', 'learn', 'works', 'simple', 'data']]


In [9]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

## Step 5- Stemming and Lemmatization

In [8]:
# Stemming and Lemmatization
from nltk.stem.porter import PorterStemmer # imports the PorterStemmer class from NLTK, which is used for stemming.
from nltk.stem.wordnet import WordNetLemmatizer #  imports the WordNetLemmatizer class from NLTK, which is used for lemmatization.

porter = PorterStemmer() #Creates an instance of the PorterStemmer class
wordnet = WordNetLemmatizer() #Creates an instance of the WordNetLemmatizer class

preprocessed_docs = []

for doc in tokenized_docs_no_stopwords:
    final_doc = []
    for word in doc:
        #final_doc.append(porter.stem(word)) #stems the current word using the Porter Stemmer and appends the stemmed form to the final_doc list.
        final_doc.append(wordnet.lemmatize(word)) # lemmatizes the current word using the WordNet Lemmatizer and appends the lemmatized form to the final_doc list.
    
    preprocessed_docs.append(final_doc) #After processing all words in a document, the resulting list of stemmed or lemmatized words (final_doc) is appended to the preprocessed_docs list.

print(preprocessed_docs)

[['writing', 'basic', 'english', 'sentence'], ['writing', 'demo', 'purpose', 'make', 'audience', 'understand', 'basic'], ['point', 'learn', 'work', 'simple', 'data']]


# Feature Extraction

In [22]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Convert preprocessed documents to strings
doc_strings = [' '.join(doc) for doc in preprocessed_docs]
print(doc_strings)

# Bag of Words (BoW) method
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(doc_strings)

# TF-IDF method
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(doc_strings)

# Vocabulary for reference
print("\nVocabulary for Bag of Words (BoW):")
print(bow_vectorizer.get_feature_names_out())
#print(bow_vectorizer.vocabulary_) #This dictionary maps each word to its index in the feature matrix
print("\nVocabulary for TF-IDF:")
print(tfidf_vectorizer.get_feature_names_out())

# Print BoW and TF-IDF matrices
print("\nBag of Words (BoW) matrix:")
print(bow_matrix.toarray())
print("\nTF-IDF matrix:")
print(tfidf_matrix.toarray())


['writing basic english sentence', 'writing demo purpose make audience understand basic', 'point learn work simple data']

Vocabulary for Bag of Words (BoW):
['audience' 'basic' 'data' 'demo' 'english' 'learn' 'make' 'point'
 'purpose' 'sentence' 'simple' 'understand' 'work' 'writing']

Vocabulary for TF-IDF:
['audience' 'basic' 'data' 'demo' 'english' 'learn' 'make' 'point'
 'purpose' 'sentence' 'simple' 'understand' 'work' 'writing']

Bag of Words (BoW) matrix:
[[0 1 0 0 1 0 0 0 0 1 0 0 0 1]
 [1 1 0 1 0 0 1 0 1 0 0 1 0 1]
 [0 0 1 0 0 1 0 1 0 0 1 0 1 0]]

TF-IDF matrix:
[[0.         0.42804604 0.         0.         0.5628291  0.
  0.         0.         0.         0.5628291  0.         0.
  0.         0.42804604]
 [0.40301621 0.30650422 0.         0.40301621 0.         0.
  0.40301621 0.         0.40301621 0.         0.         0.40301621
  0.         0.30650422]
 [0.         0.         0.4472136  0.         0.         0.4472136
  0.         0.4472136  0.         0.         0.4472136  