In [None]:
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
# from bs4 import BeautifulSoup as bs
# import lxml

# Fetch the NLTK data packages this notebook relies on
# (tokenizer models, stopword lists, WordNet).
nltk.download('punkt')
# Recent NLTK releases load word_tokenize's models from 'punkt_tab' rather
# than 'punkt'. Without it a fresh kernel can fail with a LookupError.
# NOTE(review): on older releases that do not ship this package the call is
# expected to report an error and return False rather than raise - confirm.
nltk.download('punkt_tab')
nltk.download('stopwords')
# Kept last so the cell's displayed result is still this call's return value.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
def preprocess_text(text, verbose=True):
    """Turn a raw string into a list of normalized, stemmed tokens.

    The steps run in this order: lowercase, remove integers, remove
    hyperlinks and bare domain names, remove ASCII punctuation, tokenize,
    drop English stopwords, lemmatize, stem.

    Relies on the NLTK tokenizer, stopword and WordNet data being available.

    Parameters
    ----------
    text : str
        Raw input text.
    verbose : bool, default True
        When True, print the intermediate result after every step (the
        original behaviour). Pass False to run silently, e.g. on a large
        corpus.

    Returns
    -------
    list of str
        The stemmed tokens that survive all filtering steps.
    """

    def show(label, value):
        # Step-by-step trace: label, value, blank line.
        if verbose:
            print(label)
            print(value)
            print()

    #Original text
    show("Original:", text)

    #Lowercase the text
    text = text.lower()
    show("Lowercased:", text)

    #Number Removal (optionally signed runs of digits)
    text = re.sub(r'[-+]?\d+', '', text)
    show("Integer Numbers removed:", text)

    #Remove hyperlinks
    text = re.sub(r'https?:\/\/\S*', '', text)
    text = re.sub(r'www\.\S*', '', text)
    # Bare domains such as "example.com/about": the word boundary stops the
    # pattern from cutting into ordinary tokens like "foo.community", and the
    # trailing \S* removes the path instead of leaving it behind as a token.
    text = re.sub(r'\S*\.(?:com|info|net|org)\b\S*', '', text)
    show("Sample Hyperlinks removed:", text)

    #Remove punctuations
    text = text.translate(str.maketrans('', '', string.punctuation))
    show("Sample Punctuations removed:", text)

    #Tokenize
    tokens = word_tokenize(text)
    show("Tokenized(word-level):", tokens)

    #Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    show("Sample Stopwords removed:", tokens)

    #Lemmatize tokens
    # No POS tag is passed, so NLTK's default part of speech (noun) is used
    # for every token.
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    show("Tokens Lemmatized:", tokens)

    #Stemming tokens
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    show("Tokens Stemmed:", tokens)

    return tokens

In [None]:
# Toy corpus: each string is treated as one document.
texts = [
    "You shall not pass.",
    "The axe forgets but the tree remembers.",
    "What is dead may never die.",
    "Thou shall not cheat.",
    "Everybody lies.",
    "The lone wolf dies, but the pack survives.",
]

In [None]:
# Run the full preprocessing pipeline on every document, keeping corpus order.
preprocessed_texts = [preprocess_text(sentence) for sentence in texts]

In [None]:
# Show the token list produced for each document.
preprocessed_texts

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def fit_vectorizer(vectorizer_class, texts):
  """Build a vectorizer of the given class and fit it on ``texts``.

  The vectorizer is constructed with an identity ``analyzer``, so each
  element of ``texts`` is handed to it unchanged (the notebook passes
  lists of tokens). The class being used and the learned
  ``vocabulary_`` are printed.

  Args:
    vectorizer_class: Callable accepting an ``analyzer`` keyword and
      returning an object with ``fit`` and ``vocabulary_``.
    texts: Documents to fit on.

  Returns:
    The fitted vectorizer instance.
  """
  def passthrough(document):
    # Hand the document back untouched.
    return document

  print(f"Vectorizing with {vectorizer_class}", end="\n\n")

  fitted = vectorizer_class(analyzer=passthrough)
  fitted.fit(texts)

  # Report each unique term together with its column index.
  print("Vocabulary:", fitted.vocabulary_, "", sep="\n")

  return fitted

def vectorize_texts(texts_vectorizer, texts):
  """Encode ``texts`` with an already-fitted vectorizer and print the result.

  Args:
    texts_vectorizer: Object whose ``transform(texts)`` result exposes
      ``toarray()``.
    texts: Documents to encode.

  Returns:
    Whatever ``texts_vectorizer.transform(texts).toarray()`` yields.
  """
  encoded = texts_vectorizer.transform(texts)
  texts_vectors = encoded.toarray()

  # Echo the encoded documents under a heading.
  print("Encoded Documents:", texts_vectors, sep="\n")

  return texts_vectors

In [None]:
# Learn the vocabulary from the tokenized corpus using raw term counts,
# then encode that same corpus with it.
vectorizer = fit_vectorizer(CountVectorizer, preprocessed_texts)
vectorize_texts(vectorizer, preprocessed_texts)

Vectorizing with <class 'sklearn.feature_extraction.text.CountVectorizer'>

Vocabulary:
{'shall': 14, 'pa': 11, 'axe': 0, 'forget': 6, 'tree': 17, 'rememb': 13, 'dead': 2, 'may': 9, 'never': 10, 'die': 3, 'thou': 16, 'cheat': 1, 'everybodi': 5, 'lie': 7, 'lone': 8, 'wolf': 18, 'dy': 4, 'pack': 12, 'surviv': 15}

Encoded Documents:
[[0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0]
 [1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0]
 [0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0]
 [0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 0 0 1]]


array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1]])

In [None]:
# Encode a sentence that was not part of the fitted corpus, reusing the
# vectorizer currently bound to `vectorizer`.
vectorize_texts(vectorizer, [preprocess_text("Everybody dies some day.")])

Original:
Everybody dies some day.

Lowercased:
everybody dies some day.

Integer Numbers removed:
everybody dies some day.

Sample Hyperlinks removed:
everybody dies some day.

Sample Punctuations removed:
everybody dies some day

Tokenized(word-level):
['everybody', 'dies', 'some', 'day']

Sample Stopwords removed:
['everybody', 'dies', 'day']

Tokens Lemmatized:
['everybody', 'dy', 'day']

Tokens Stemmed:
['everybodi', 'dy', 'day']

Encoded Documents:
[[0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]]


array([[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [None]:
# Repeat the experiment with TF-IDF weighting instead of raw counts.
# NOTE: this rebinds `vectorizer`; any earlier cell that reads `vectorizer`
# will use the TF-IDF model if it is re-run after this one.
vectorizer = fit_vectorizer(TfidfVectorizer, preprocessed_texts)
vectorize_texts(vectorizer, preprocessed_texts)
vectorize_texts(vectorizer, [preprocess_text("Everybody dies some day.")])

Vectorizing with <class 'sklearn.feature_extraction.text.TfidfVectorizer'>

Vocabulary:
{'shall': 14, 'pa': 11, 'axe': 0, 'forget': 6, 'tree': 17, 'rememb': 13, 'dead': 2, 'may': 9, 'never': 10, 'die': 3, 'thou': 16, 'cheat': 1, 'everybodi': 5, 'lie': 7, 'lone': 8, 'wolf': 18, 'dy': 4, 'pack': 12, 'surviv': 15}

Encoded Documents:
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.77326237
  0.         0.         0.6340862  0.         0.         0.
  0.        ]
 [0.5        0.         0.         0.         0.         0.
  0.5        0.         0.         0.         0.         0.
  0.         0.5        0.         0.         0.         0.5
  0.        ]
 [0.         0.         0.5        0.5        0.         0.
  0.         0.         0.         0.5        0.5        0.
  0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.61171251 0.         0.         0.         0.
  0.         0. 

array([[0.        , 0.        , 0.        , 0.        , 0.70710678,
        0.70710678, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ]])